From 62fc15f009eba86e65f2f7448829429eae9ad071 Mon Sep 17 00:00:00 2001
From: Changyu Bi
Date: Tue, 25 Apr 2023 12:08:23 -0700
Subject: [PATCH] Block per key-value checksum (#11287)

Summary:
Add the option `block_protection_bytes_per_key` and an implementation of block per key-value checksums. The main changes are
1. checksum construction and verification in block.cc/h
2. passing the option `block_protection_bytes_per_key` around (mainly for methods defined in table_cache.h)
3. unit test / crash test updates

Tests:
* Added unit tests
* Crash test: `python3 tools/db_crashtest.py blackbox --simple --block_protection_bytes_per_key=1 --write_buffer_size=1048576`

Follow-up (maybe as a separate PR): make sure the corruption status returned from BlockIters is correctly handled.

Performance:
Turning on block per KV protection has a non-trivial negative impact on read performance and costs additional memory. For memory, each block includes an additional 24 bytes of checksum-related state besides the checksum itself. For CPU, I set up a DB of size ~1.2GB with 5M keys (32-byte keys and 200-byte values) that compacts to ~5 SST files (target file size 256 MB) in L6 without compression. I tested readrandom performance with various block cache sizes (to mimic various cache hit rates):

```
SETUP
make OPTIMIZE_LEVEL="-O3" USE_LTO=1 DEBUG_LEVEL=0 -j32 db_bench
./db_bench -benchmarks=fillseq,compact0,waitforcompaction,compact,waitforcompaction -write_buffer_size=33554432 -level_compaction_dynamic_level_bytes=true -max_background_jobs=8 -target_file_size_base=268435456 --num=5000000 --key_size=32 --value_size=200 --compression_type=none

BENCHMARK
./db_bench --use_existing_db -benchmarks=readtocache,readrandom[-X10] --num=5000000 --key_size=32 --disable_auto_compactions --reads=1000000 --block_protection_bytes_per_key=[0|1] --cache_size=$CACHESIZE

The readrandom ops/sec looks like the following:

Block cache size:  2GB     1.2GB * 0.9  1.2GB * 0.8  1.2GB * 0.5  8MB
Main               240805  223604       198176       161653       139040
PR prot_bytes=0    238691  226693       200127       161082       141153
PR prot_bytes=1    214983  193199       178532       137013       108211
prot_bytes=1 vs    -10%    -15%         -10.8%       -15%         -23%
prot_bytes=0
```

The benchmark has a lot of variance, but there was a 5% to 25% regression in this benchmark with different cache hit rates.
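For reviewers, a minimal usage sketch of enabling the new option. This is not part of the diff below; the DB path and the `main()` wrapper are placeholders, and the only thing taken from this patch is the `block_protection_bytes_per_key` member added to `include/rocksdb/advanced_options.h`:

```
#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // New in this patch: per key-value checksum protection for in-memory
  // blocks. Supported values are 0 (default, no protection), 1, 2, 4 and 8
  // bytes per key; e.g. 1 byte gives a 255/256 chance of detecting a
  // corruption.
  options.block_protection_bytes_per_key = 1;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/block_prot_example", &db);
  assert(s.ok());
  // Point reads and iterators now verify the per key-value checksum of
  // each key-value they return from an in-memory block.
  delete db;
  return 0;
}
```

Since the option is registered as mutable in `options/cf_options.cc`, it should also be adjustable at runtime, e.g. `db->SetOptions({{"block_protection_bytes_per_key", "2"}})`.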
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11287
Reviewed By: ajkr
Differential Revision: D43970708
Pulled By: cbi42
fbshipit-source-id: ef98d898b71779846fa74212b9ec9e08b7183940
---
 HISTORY.md                                    |   1 +
 db/builder.cc                                 |   3 +-
 db/column_family.cc                           |   6 +
 db/compaction/compaction_job.cc               |   8 +-
 db/compaction/compaction_job_test.cc          |   3 +-
 db/convenience.cc                             |   4 +-
 db/external_sst_file_ingestion_job.cc         |   1 +
 db/forward_iterator.cc                        |  21 +-
 db/import_column_family_job.cc                |   1 +
 db/kv_checksum.h                              |  98 +-
 db/memtable.cc                                |  60 +-
 db/memtable.h                                 |   2 +-
 db/repair.cc                                  |  12 +-
 db/table_cache.cc                             |  89 +-
 db/table_cache.h                              |  19 +-
 db/table_cache_sync_and_async.h               |   3 +-
 db/version_builder.cc                         |  17 +-
 db/version_builder.h                          |   3 +-
 db/version_edit_handler.cc                    |  14 +-
 db/version_set.cc                             |  53 +-
 db/version_set_sync_and_async.h               |   1 +
 db_stress_tool/db_stress_common.h             |   1 +
 db_stress_tool/db_stress_gflags.cc            |   5 +
 db_stress_tool/db_stress_test_base.cc         |   1 +
 fuzz/sst_file_writer_fuzzer.cc                |   3 +-
 include/rocksdb/advanced_options.h            |  14 +
 options/cf_options.cc                         |   4 +
 options/cf_options.h                          |   3 +
 options/options_helper.cc                     |   2 +
 options/options_settable_test.cc              |   3 +-
 table/block_based/block.cc                    | 222 ++++-
 table/block_based/block.h                     | 221 ++++-
 .../block_based/block_based_table_builder.cc  |   7 +-
 .../block_based/block_based_table_factory.cc  |   3 +-
 table/block_based/block_based_table_reader.cc |  13 +-
 table/block_based/block_based_table_reader.h  |   1 +
 .../block_based_table_reader_test.cc          |   5 +-
 table/block_based/block_cache.cc              |  10 +
 table/block_based/block_cache.h               |  16 +-
 table/block_based/block_test.cc               | 920 +++++++++++++++++-
 .../block_based/data_block_hash_index_test.cc |   5 +-
 table/block_fetcher_test.cc                   |   6 +-
 table/sst_file_dumper.cc                      |   8 +-
 table/sst_file_reader.cc                      |   3 +-
 table/table_builder.h                         |  11 +-
 table/table_reader_bench.cc                   |   2 +-
 table/table_test.cc                           |   9 +-
 tools/db_bench_tool.cc                        |   6 +
 tools/db_crashtest.py                         |   1 +
 49 files changed, 1695 insertions(+), 229 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 979a0eef4..7df785724 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,7 @@
 # Rocksdb Change Log
 ## Unreleased
 ### New Features
+* Introduced a new option `block_protection_bytes_per_key`, which can be used to enable per key-value integrity protection for in-memory blocks in block cache (#11287).
## 8.2.0 (04/24/2023) ### Public API Changes diff --git a/db/builder.cc b/db/builder.cc index be1ec29bf..eadc315c9 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -380,7 +380,8 @@ Status BuildTable( MaxFileSizeForL0MetaPin(mutable_cf_options), /*smallest_compaction_key=*/nullptr, /*largest_compaction_key*/ nullptr, - /*allow_unprepared_value*/ false)); + /*allow_unprepared_value*/ false, + mutable_cf_options.block_protection_bytes_per_key)); s = it->status(); if (s.ok() && paranoid_file_checks) { OutputValidator file_validator(tboptions.internal_comparator, diff --git a/db/column_family.cc b/db/column_family.cc index 24ea46ac4..0b3fe6807 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -1428,6 +1428,12 @@ Status ColumnFamilyData::ValidateOptions( "Memtable per key-value checksum protection only supports 0, 1, 2, 4 " "or 8 bytes per key."); } + if (std::find(supported.begin(), supported.end(), + cf_options.block_protection_bytes_per_key) == supported.end()) { + return Status::NotSupported( + "Block per key-value checksum protection only supports 0, 1, 2, 4 " + "or 8 bytes per key."); + } return s; } diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 8a326a508..ed152f28c 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -504,7 +504,9 @@ void CompactionJob::GenSubcompactionBoundaries() { FileMetaData* f = flevel->files[i].file_metadata; std::vector my_anchors; Status s = cfd->table_cache()->ApproximateKeyAnchors( - read_options, icomp, *f, my_anchors); + read_options, icomp, *f, + c->mutable_cf_options()->block_protection_bytes_per_key, + my_anchors); if (!s.ok() || my_anchors.empty()) { my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize()); } @@ -735,7 +737,9 @@ Status CompactionJob::Run() { *compact_->compaction->mutable_cf_options()), /*smallest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr, - /*allow_unprepared_value=*/false); + /*allow_unprepared_value=*/false, + compact_->compaction->mutable_cf_options() + ->block_protection_bytes_per_key); auto s = iter->status(); if (s.ok() && paranoid_file_checks_) { diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index 9c5784d5e..79f8e5fd5 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -454,7 +454,8 @@ class CompactionJobTestBase : public testing::Test { Status s = cf_options_.table_factory->NewTableReader( read_opts, TableReaderOptions(*cfd->ioptions(), nullptr, FileOptions(), - cfd_->internal_comparator()), + cfd_->internal_comparator(), + 0 /* block_protection_bytes_per_key */), std::move(freader), file_size, &table_reader, false); ASSERT_OK(s); assert(table_reader); diff --git a/db/convenience.cc b/db/convenience.cc index 8ab7cbc13..32cdfafaa 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -64,8 +64,8 @@ Status VerifySstFileChecksum(const Options& options, const bool kImmortal = true; auto reader_options = TableReaderOptions( ioptions, options.prefix_extractor, env_options, internal_comparator, - false /* skip_filters */, !kImmortal, false /* force_direct_prefetch */, - -1 /* level */); + options.block_protection_bytes_per_key, false /* skip_filters */, + !kImmortal, false /* force_direct_prefetch */, -1 /* level */); reader_options.largest_seqno = largest_seqno; s = ioptions.table_factory->NewTableReader( reader_options, std::move(file_reader), file_size, &table_reader, diff --git a/db/external_sst_file_ingestion_job.cc 
b/db/external_sst_file_ingestion_job.cc index 98bd6050a..ca9b6fb9b 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -678,6 +678,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( TableReaderOptions( *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor, env_options_, cfd_->internal_comparator(), + sv->mutable_cf_options.block_protection_bytes_per_key, /*skip_filters*/ false, /*immortal*/ false, /*force_direct_prefetch*/ false, /*level*/ -1, /*block_cache_tracer*/ nullptr, diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index eddade837..75a7c599b 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -36,7 +36,7 @@ class ForwardLevelIterator : public InternalIterator { const ColumnFamilyData* const cfd, const ReadOptions& read_options, const std::vector& files, const std::shared_ptr& prefix_extractor, - bool allow_unprepared_value) + bool allow_unprepared_value, uint8_t block_protection_bytes_per_key) : cfd_(cfd), read_options_(read_options), files_(files), @@ -45,7 +45,8 @@ class ForwardLevelIterator : public InternalIterator { file_iter_(nullptr), pinned_iters_mgr_(nullptr), prefix_extractor_(prefix_extractor), - allow_unprepared_value_(allow_unprepared_value) { + allow_unprepared_value_(allow_unprepared_value), + block_protection_bytes_per_key_(block_protection_bytes_per_key) { status_.PermitUncheckedError(); // Allow uninitialized status through } @@ -87,7 +88,8 @@ class ForwardLevelIterator : public InternalIterator { /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, /*max_file_size_for_l0_meta_pin=*/0, /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr, allow_unprepared_value_); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_, + block_protection_bytes_per_key_); file_iter_->SetPinnedItersMgr(pinned_iters_mgr_); valid_ = false; if (!range_del_agg.IsEmpty()) { @@ -211,6 +213,7 @@ class ForwardLevelIterator : public InternalIterator { // Kept alive by ForwardIterator::sv_->mutable_cf_options const std::shared_ptr& prefix_extractor_; const bool allow_unprepared_value_; + const uint8_t block_protection_bytes_per_key_; }; ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options, @@ -738,7 +741,8 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { /*skip_filters=*/false, /*level=*/-1, MaxFileSizeForL0MetaPin(sv_->mutable_cf_options), /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr, allow_unprepared_value_)); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_, + sv_->mutable_cf_options.block_protection_bytes_per_key)); } BuildLevelIterators(vstorage, sv_); current_ = nullptr; @@ -819,7 +823,8 @@ void ForwardIterator::RenewIterators() { /*skip_filters=*/false, /*level=*/-1, MaxFileSizeForL0MetaPin(svnew->mutable_cf_options), /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr, allow_unprepared_value_)); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_, + svnew->mutable_cf_options.block_protection_bytes_per_key)); } for (auto* f : l0_iters_) { @@ -863,7 +868,8 @@ void ForwardIterator::BuildLevelIterators(const VersionStorageInfo* vstorage, } else { level_iters_.push_back(new ForwardLevelIterator( cfd_, read_options_, level_files, - sv->mutable_cf_options.prefix_extractor, allow_unprepared_value_)); + sv->mutable_cf_options.prefix_extractor, allow_unprepared_value_, + sv->mutable_cf_options.block_protection_bytes_per_key)); } } } @@ -885,7 +891,8 @@ void 
ForwardIterator::ResetIncompleteIterators() { /*skip_filters=*/false, /*level=*/-1, MaxFileSizeForL0MetaPin(sv_->mutable_cf_options), /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr, allow_unprepared_value_); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_, + sv_->mutable_cf_options.block_protection_bytes_per_key); l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_); } diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc index 12d2519e9..9a8b48dd0 100644 --- a/db/import_column_family_job.cc +++ b/db/import_column_family_job.cc @@ -250,6 +250,7 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo( TableReaderOptions( *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor, env_options_, cfd_->internal_comparator(), + sv->mutable_cf_options.block_protection_bytes_per_key, /*skip_filters*/ false, /*immortal*/ false, /*force_direct_prefetch*/ false, /*level*/ -1, /*block_cache_tracer*/ nullptr, diff --git a/db/kv_checksum.h b/db/kv_checksum.h index bce507fcf..53c02485f 100644 --- a/db/kv_checksum.h +++ b/db/kv_checksum.h @@ -46,6 +46,8 @@ template class ProtectionInfoKVOC; template class ProtectionInfoKVOS; +template +class ProtectionInfoKV; // Aliases for 64-bit protection infos. using ProtectionInfo64 = ProtectionInfo; @@ -64,13 +66,13 @@ class ProtectionInfo { ProtectionInfoKVO ProtectKVO(const SliceParts& key, const SliceParts& value, ValueType op_type) const; - - T GetVal() const { return val_; } + ProtectionInfoKV ProtectKV(const Slice& key, const Slice& value) const; private: friend class ProtectionInfoKVO; friend class ProtectionInfoKVOS; friend class ProtectionInfoKVOC; + friend class ProtectionInfoKV; // Each field is hashed with an independent value so we can catch fields being // swapped. Per the `NPHash64()` docs, using consecutive seeds is a pitfall, @@ -89,8 +91,47 @@ class ProtectionInfo { static_assert(sizeof(ProtectionInfo) == sizeof(T), ""); } + T GetVal() const { return val_; } void SetVal(T val) { val_ = val; } + void Encode(uint8_t len, char* dst) const { + assert(sizeof(val_) >= len); + switch (len) { + case 1: + dst[0] = static_cast(val_); + break; + case 2: + EncodeFixed16(dst, static_cast(val_)); + break; + case 4: + EncodeFixed32(dst, static_cast(val_)); + break; + case 8: + EncodeFixed64(dst, static_cast(val_)); + break; + default: + assert(false); + } + } + + bool Verify(uint8_t len, const char* checksum_ptr) const { + assert(sizeof(val_) >= len); + switch (len) { + case 1: + return static_cast(checksum_ptr[0]) == + static_cast(val_); + case 2: + return DecodeFixed16(checksum_ptr) == static_cast(val_); + case 4: + return DecodeFixed32(checksum_ptr) == static_cast(val_); + case 8: + return DecodeFixed64(checksum_ptr) == static_cast(val_); + default: + assert(false); + return false; + } + } + T val_ = 0; }; @@ -113,7 +154,14 @@ class ProtectionInfoKVO { void UpdateV(const SliceParts& old_value, const SliceParts& new_value); void UpdateO(ValueType old_op_type, ValueType new_op_type); - T GetVal() const { return info_.GetVal(); } + // Encode this protection info into `len` bytes and stores them in `dst`. + void Encode(uint8_t len, char* dst) const { info_.Encode(len, dst); } + // Verify this protection info against the protection info encoded by Encode() + // at the first `len` bytes of `checksum_ptr`. + // Returns true iff the verification is successful. 
+ bool Verify(uint8_t len, const char* checksum_ptr) const { + return info_.Verify(len, checksum_ptr); + } private: friend class ProtectionInfo; @@ -124,6 +172,7 @@ class ProtectionInfoKVO { static_assert(sizeof(ProtectionInfoKVO) == sizeof(T), ""); } + T GetVal() const { return info_.GetVal(); } void SetVal(T val) { info_.SetVal(val); } ProtectionInfo info_; @@ -154,7 +203,10 @@ class ProtectionInfoKVOC { void UpdateC(ColumnFamilyId old_column_family_id, ColumnFamilyId new_column_family_id); - T GetVal() const { return kvo_.GetVal(); } + void Encode(uint8_t len, char* dst) const { kvo_.Encode(len, dst); } + bool Verify(uint8_t len, const char* checksum_ptr) const { + return kvo_.Verify(len, checksum_ptr); + } private: friend class ProtectionInfoKVO; @@ -163,6 +215,7 @@ class ProtectionInfoKVOC { static_assert(sizeof(ProtectionInfoKVOC) == sizeof(T), ""); } + T GetVal() const { return kvo_.GetVal(); } void SetVal(T val) { kvo_.SetVal(val); } ProtectionInfoKVO kvo_; @@ -193,7 +246,10 @@ class ProtectionInfoKVOS { void UpdateS(SequenceNumber old_sequence_number, SequenceNumber new_sequence_number); - T GetVal() const { return kvo_.GetVal(); } + void Encode(uint8_t len, char* dst) const { kvo_.Encode(len, dst); } + bool Verify(uint8_t len, const char* checksum_ptr) const { + return kvo_.Verify(len, checksum_ptr); + } private: friend class ProtectionInfoKVO; @@ -202,11 +258,32 @@ class ProtectionInfoKVOS { static_assert(sizeof(ProtectionInfoKVOS) == sizeof(T), ""); } + T GetVal() const { return kvo_.GetVal(); } void SetVal(T val) { kvo_.SetVal(val); } ProtectionInfoKVO kvo_; }; +template +class ProtectionInfoKV { + public: + ProtectionInfoKV() = default; + + void Encode(uint8_t len, char* dst) const { info_.Encode(len, dst); } + bool Verify(uint8_t len, const char* checksum_ptr) const { + return info_.Verify(len, checksum_ptr); + } + + private: + friend class ProtectionInfo; + + explicit ProtectionInfoKV(T val) : info_(val) { + static_assert(sizeof(ProtectionInfoKV) == sizeof(T)); + } + + ProtectionInfo info_; +}; + template Status ProtectionInfo::GetStatus() const { if (val_ != 0) { @@ -244,6 +321,16 @@ ProtectionInfoKVO ProtectionInfo::ProtectKVO(const SliceParts& key, return ProtectionInfoKVO(val); } +template +ProtectionInfoKV ProtectionInfo::ProtectKV(const Slice& key, + const Slice& value) const { + T val = GetVal(); + val = val ^ static_cast(GetSliceNPHash64(key, ProtectionInfo::kSeedK)); + val = + val ^ static_cast(GetSliceNPHash64(value, ProtectionInfo::kSeedV)); + return ProtectionInfoKV(val); +} + template void ProtectionInfoKVO::UpdateK(const Slice& old_key, const Slice& new_key) { T val = GetVal(); @@ -394,5 +481,4 @@ void ProtectionInfoKVOS::UpdateS(SequenceNumber old_sequence_number, sizeof(new_sequence_number), ProtectionInfo::kSeedS)); SetVal(val); } - } // namespace ROCKSDB_NAMESPACE diff --git a/db/memtable.cc b/db/memtable.cc index e61ddc9db..f6c0cc624 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -256,7 +256,7 @@ void MemTable::UpdateOldestKeyTime() { } Status MemTable::VerifyEntryChecksum(const char* entry, - size_t protection_bytes_per_key, + uint32_t protection_bytes_per_key, bool allow_data_in_errors) { if (protection_bytes_per_key == 0) { return Status::OK(); @@ -285,28 +285,11 @@ Status MemTable::VerifyEntryChecksum(const char* entry, Slice value = Slice(value_ptr, value_length); const char* checksum_ptr = value_ptr + value_length; - uint64_t expected = ProtectionInfo64() - .ProtectKVO(user_key, value, type) - .ProtectS(seq) - .GetVal(); - bool match = true; - 
switch (protection_bytes_per_key) { - case 1: - match = static_cast(checksum_ptr[0]) == - static_cast(expected); - break; - case 2: - match = DecodeFixed16(checksum_ptr) == static_cast(expected); - break; - case 4: - match = DecodeFixed32(checksum_ptr) == static_cast(expected); - break; - case 8: - match = DecodeFixed64(checksum_ptr) == expected; - break; - default: - assert(false); - } + bool match = + ProtectionInfo64() + .ProtectKVO(user_key, value, type) + .ProtectS(seq) + .Verify(static_cast(protection_bytes_per_key), checksum_ptr); if (!match) { std::string msg( "Corrupted memtable entry, per key-value checksum verification " @@ -526,7 +509,7 @@ class MemTableIterator : public InternalIterator { bool valid_; bool arena_mode_; bool value_pinned_; - size_t protection_bytes_per_key_; + uint32_t protection_bytes_per_key_; Status status_; Logger* logger_; @@ -684,28 +667,15 @@ void MemTable::UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info, return; } - uint64_t checksum = 0; if (kv_prot_info == nullptr) { - checksum = - ProtectionInfo64().ProtectKVO(key, value, type).ProtectS(s).GetVal(); + ProtectionInfo64() + .ProtectKVO(key, value, type) + .ProtectS(s) + .Encode(static_cast(moptions_.protection_bytes_per_key), + checksum_ptr); } else { - checksum = kv_prot_info->GetVal(); - } - switch (moptions_.protection_bytes_per_key) { - case 1: - checksum_ptr[0] = static_cast(checksum); - break; - case 2: - EncodeFixed16(checksum_ptr, static_cast(checksum)); - break; - case 4: - EncodeFixed32(checksum_ptr, static_cast(checksum)); - break; - case 8: - EncodeFixed64(checksum_ptr, checksum); - break; - default: - assert(false); + kv_prot_info->Encode( + static_cast(moptions_.protection_bytes_per_key), checksum_ptr); } } @@ -902,7 +872,7 @@ struct Saver { ReadCallback* callback_; bool* is_blob_index; bool allow_data_in_errors; - size_t protection_bytes_per_key; + uint32_t protection_bytes_per_key; bool CheckCallback(SequenceNumber _seq) { if (callback_) { return callback_->IsVisible(_seq); diff --git a/db/memtable.h b/db/memtable.h index aa2ba87ca..eefabcf88 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -529,7 +529,7 @@ class MemTable { // Returns Corruption status if verification fails. 
static Status VerifyEntryChecksum(const char* entry, - size_t protection_bytes_per_key, + uint32_t protection_bytes_per_key, bool allow_data_in_errors = false); private: diff --git a/db/repair.cc b/db/repair.cc index b4b9d0c5f..633c348a5 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -518,8 +518,9 @@ class Repairer { if (status.ok()) { // TODO: plumb Env::IOActivity const ReadOptions read_options; - status = table_cache_->GetTableProperties(file_options_, read_options, - icmp_, t->meta, &props); + status = table_cache_->GetTableProperties( + file_options_, read_options, icmp_, t->meta, &props, + 0 /* block_protection_bytes_per_key */); } if (status.ok()) { auto s = @@ -577,7 +578,8 @@ class Repairer { /*level=*/-1, /*max_file_size_for_l0_meta_pin=*/0, /*smallest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr, - /*allow_unprepared_value=*/false); + /*allow_unprepared_value=*/false, + cfd->GetLatestMutableCFOptions()->block_protection_bytes_per_key); ParsedInternalKey parsed; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { Slice key = iter->key(); @@ -617,7 +619,9 @@ class Repairer { ReadOptions ropts; std::unique_ptr r_iter; status = table_cache_->GetRangeTombstoneIterator( - ropts, cfd->internal_comparator(), t->meta, &r_iter); + ropts, cfd->internal_comparator(), t->meta, + cfd->GetLatestMutableCFOptions()->block_protection_bytes_per_key, + &r_iter); if (r_iter) { r_iter->SeekToFirst(); diff --git a/db/table_cache.cc b/db/table_cache.cc index 28206ed35..c288ec8c7 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -91,7 +91,8 @@ Status TableCache::GetTableReader( const ReadOptions& ro, const FileOptions& file_options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, bool sequential_mode, bool record_read_stats, - HistogramImpl* file_read_hist, std::unique_ptr* table_reader, + uint8_t block_protection_bytes_per_key, HistogramImpl* file_read_hist, + std::unique_ptr* table_reader, const std::shared_ptr& prefix_extractor, bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) { @@ -140,7 +141,8 @@ Status TableCache::GetTableReader( s = ioptions_.table_factory->NewTableReader( ro, TableReaderOptions(ioptions_, prefix_extractor, file_options, - internal_comparator, skip_filters, immortal_tables_, + internal_comparator, block_protection_bytes_per_key, + skip_filters, immortal_tables_, false /* force_direct_prefetch */, level, block_cache_tracer_, max_file_size_for_l0_meta_pin, db_session_id_, file_meta.fd.GetNumber(), @@ -156,6 +158,7 @@ Status TableCache::FindTable( const ReadOptions& ro, const FileOptions& file_options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, TypedHandle** handle, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor, const bool no_io, bool record_read_stats, HistogramImpl* file_read_hist, bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, @@ -179,12 +182,12 @@ Status TableCache::FindTable( } std::unique_ptr table_reader; - Status s = - GetTableReader(ro, file_options, internal_comparator, file_meta, - false /* sequential mode */, record_read_stats, - file_read_hist, &table_reader, prefix_extractor, - skip_filters, level, prefetch_index_and_filter_in_cache, - max_file_size_for_l0_meta_pin, file_temperature); + Status s = GetTableReader(ro, file_options, internal_comparator, file_meta, + false /* sequential mode */, record_read_stats, + 
block_protection_bytes_per_key, file_read_hist, + &table_reader, prefix_extractor, skip_filters, + level, prefetch_index_and_filter_in_cache, + max_file_size_for_l0_meta_pin, file_temperature); if (!s.ok()) { assert(table_reader == nullptr); RecordTick(ioptions_.stats, NO_FILE_ERRORS); @@ -212,6 +215,7 @@ InternalIterator* TableCache::NewIterator( size_t max_file_size_for_l0_meta_pin, const InternalKey* smallest_compaction_key, const InternalKey* largest_compaction_key, bool allow_unprepared_value, + uint8_t block_protection_bytes_per_key, TruncatedRangeDelIterator** range_del_iter) { PERF_TIMER_GUARD(new_table_iterator_nanos); @@ -225,12 +229,13 @@ InternalIterator* TableCache::NewIterator( auto& fd = file_meta.fd; table_reader = fd.table_reader; if (table_reader == nullptr) { - s = FindTable( - options, file_options, icomparator, file_meta, &handle, - prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */, - !for_compaction /* record_read_stats */, file_read_hist, skip_filters, - level, true /* prefetch_index_and_filter_in_cache */, - max_file_size_for_l0_meta_pin, file_meta.temperature); + s = FindTable(options, file_options, icomparator, file_meta, &handle, + block_protection_bytes_per_key, prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + !for_compaction /* record_read_stats */, file_read_hist, + skip_filters, level, + true /* prefetch_index_and_filter_in_cache */, + max_file_size_for_l0_meta_pin, file_meta.temperature); if (s.ok()) { table_reader = cache_.Value(handle); } @@ -308,7 +313,7 @@ InternalIterator* TableCache::NewIterator( Status TableCache::GetRangeTombstoneIterator( const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, + const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, std::unique_ptr* out_iter) { assert(out_iter); const FileDescriptor& fd = file_meta.fd; @@ -317,7 +322,7 @@ Status TableCache::GetRangeTombstoneIterator( TypedHandle* handle = nullptr; if (t == nullptr) { s = FindTable(options, file_options_, internal_comparator, file_meta, - &handle); + &handle, block_protection_bytes_per_key); if (s.ok()) { t = cache_.Value(handle); } @@ -403,6 +408,7 @@ Status TableCache::Get( const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, const Slice& k, GetContext* get_context, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor, HistogramImpl* file_read_hist, bool skip_filters, int level, size_t max_file_size_for_l0_meta_pin) { @@ -430,7 +436,7 @@ Status TableCache::Get( assert(s.ok()); if (t == nullptr) { s = FindTable(options, file_options_, internal_comparator, file_meta, - &handle, prefix_extractor, + &handle, block_protection_bytes_per_key, prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */, true /* record_read_stats */, file_read_hist, skip_filters, level, true /* prefetch_index_and_filter_in_cache */, @@ -513,7 +519,8 @@ Status TableCache::MultiGetFilter( const FileMetaData& file_meta, const std::shared_ptr& prefix_extractor, HistogramImpl* file_read_hist, int level, - MultiGetContext::Range* mget_range, TypedHandle** table_handle) { + MultiGetContext::Range* mget_range, TypedHandle** table_handle, + uint8_t block_protection_bytes_per_key) { auto& fd = file_meta.fd; IterKey row_cache_key; std::string row_cache_entry_buffer; @@ -531,12 +538,13 @@ Status TableCache::MultiGetFilter( MultiGetContext::Range tombstone_range(*mget_range, mget_range->begin(), 
mget_range->end()); if (t == nullptr) { - s = FindTable( - options, file_options_, internal_comparator, file_meta, &handle, - prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */, - true /* record_read_stats */, file_read_hist, /*skip_filters=*/false, - level, true /* prefetch_index_and_filter_in_cache */, - /*max_file_size_for_l0_meta_pin=*/0, file_meta.temperature); + s = FindTable(options, file_options_, internal_comparator, file_meta, + &handle, block_protection_bytes_per_key, prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + true /* record_read_stats */, file_read_hist, + /*skip_filters=*/false, level, + true /* prefetch_index_and_filter_in_cache */, + /*max_file_size_for_l0_meta_pin=*/0, file_meta.temperature); if (s.ok()) { t = cache_.Value(handle); } @@ -564,6 +572,7 @@ Status TableCache::GetTableProperties( const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, std::shared_ptr* properties, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor, bool no_io) { auto table_reader = file_meta.fd.table_reader; // table already been pre-loaded? @@ -575,7 +584,8 @@ Status TableCache::GetTableProperties( TypedHandle* table_handle = nullptr; Status s = FindTable(read_options, file_options, internal_comparator, - file_meta, &table_handle, prefix_extractor, no_io); + file_meta, &table_handle, block_protection_bytes_per_key, + prefix_extractor, no_io); if (!s.ok()) { return s; } @@ -588,12 +598,14 @@ Status TableCache::GetTableProperties( Status TableCache::ApproximateKeyAnchors( const ReadOptions& ro, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, std::vector& anchors) { + const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, + std::vector& anchors) { Status s; TableReader* t = file_meta.fd.table_reader; TypedHandle* handle = nullptr; if (t == nullptr) { - s = FindTable(ro, file_options_, internal_comparator, file_meta, &handle); + s = FindTable(ro, file_options_, internal_comparator, file_meta, &handle, + block_protection_bytes_per_key); if (s.ok()) { t = cache_.Value(handle); } @@ -610,7 +622,7 @@ Status TableCache::ApproximateKeyAnchors( size_t TableCache::GetMemoryUsageByTableReader( const FileOptions& file_options, const ReadOptions& read_options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, + const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor) { auto table_reader = file_meta.fd.table_reader; // table already been pre-loaded? 
@@ -620,7 +632,8 @@ size_t TableCache::GetMemoryUsageByTableReader( TypedHandle* table_handle = nullptr; Status s = FindTable(read_options, file_options, internal_comparator, - file_meta, &table_handle, prefix_extractor, true); + file_meta, &table_handle, block_protection_bytes_per_key, + prefix_extractor, true /* no_io */); if (!s.ok()) { return 0; } @@ -639,16 +652,17 @@ uint64_t TableCache::ApproximateOffsetOf( const ReadOptions& read_options, const Slice& key, const FileMetaData& file_meta, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor) { uint64_t result = 0; TableReader* table_reader = file_meta.fd.table_reader; TypedHandle* table_handle = nullptr; if (table_reader == nullptr) { const bool for_compaction = (caller == TableReaderCaller::kCompaction); - Status s = - FindTable(read_options, file_options_, internal_comparator, file_meta, - &table_handle, prefix_extractor, false /* no_io */, - !for_compaction /* record_read_stats */); + Status s = FindTable( + read_options, file_options_, internal_comparator, file_meta, + &table_handle, block_protection_bytes_per_key, prefix_extractor, + false /* no_io */, !for_compaction /* record_read_stats */); if (s.ok()) { table_reader = cache_.Value(table_handle); } @@ -668,16 +682,17 @@ uint64_t TableCache::ApproximateSize( const ReadOptions& read_options, const Slice& start, const Slice& end, const FileMetaData& file_meta, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor) { uint64_t result = 0; TableReader* table_reader = file_meta.fd.table_reader; TypedHandle* table_handle = nullptr; if (table_reader == nullptr) { const bool for_compaction = (caller == TableReaderCaller::kCompaction); - Status s = - FindTable(read_options, file_options_, internal_comparator, file_meta, - &table_handle, prefix_extractor, false /* no_io */, - !for_compaction /* record_read_stats */); + Status s = FindTable( + read_options, file_options_, internal_comparator, file_meta, + &table_handle, block_protection_bytes_per_key, prefix_extractor, + false /* no_io */, !for_compaction /* record_read_stats */); if (s.ok()) { table_reader = cache_.Value(table_handle); } diff --git a/db/table_cache.h b/db/table_cache.h index 609e67498..41201eea8 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -96,6 +96,7 @@ class TableCache { size_t max_file_size_for_l0_meta_pin, const InternalKey* smallest_compaction_key, const InternalKey* largest_compaction_key, bool allow_unprepared_value, + uint8_t protection_bytes_per_key, TruncatedRangeDelIterator** range_del_iter = nullptr); // If a seek to internal key "k" in specified file finds an entry, @@ -112,6 +113,7 @@ class TableCache { const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, const Slice& k, GetContext* get_context, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor = nullptr, HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, int level = -1, size_t max_file_size_for_l0_meta_pin = 0); @@ -121,7 +123,7 @@ class TableCache { Status GetRangeTombstoneIterator( const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, + const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, std::unique_ptr* out_iter); // Call table reader's MultiGetFilter to use 
the bloom filter to filter out @@ -135,7 +137,8 @@ class TableCache { const FileMetaData& file_meta, const std::shared_ptr& prefix_extractor, HistogramImpl* file_read_hist, int level, - MultiGetContext::Range* mget_range, TypedHandle** table_handle); + MultiGetContext::Range* mget_range, TypedHandle** table_handle, + uint8_t block_protection_bytes_per_key); // If a seek to internal key "k" in specified file finds an entry, // call get_context->SaveValue() repeatedly until @@ -150,6 +153,7 @@ class TableCache { Status, MultiGet, const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, const MultiGetContext::Range* mget_range, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor = nullptr, HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, bool skip_range_deletions = false, int level = -1, @@ -165,6 +169,7 @@ class TableCache { const ReadOptions& ro, const FileOptions& toptions, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, TypedHandle**, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor = nullptr, const bool no_io = false, bool record_read_stats = true, HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, @@ -183,12 +188,14 @@ class TableCache { const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, std::shared_ptr* properties, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor = nullptr, bool no_io = false); Status ApproximateKeyAnchors(const ReadOptions& ro, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, + uint8_t block_protection_bytes_per_key, std::vector& anchors); // Return total memory usage of the table reader of the file. @@ -196,7 +203,7 @@ class TableCache { size_t GetMemoryUsageByTableReader( const FileOptions& toptions, const ReadOptions& read_options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, + const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor = nullptr); // Returns approximated offset of a key in a file represented by fd. 
@@ -204,6 +211,7 @@ class TableCache { const ReadOptions& read_options, const Slice& key, const FileMetaData& file_meta, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor = nullptr); // Returns approximated data size between start and end keys in a file @@ -212,6 +220,7 @@ class TableCache { const ReadOptions& read_options, const Slice& start, const Slice& end, const FileMetaData& file_meta, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor = nullptr); CacheInterface& get_cache() { return cache_; } @@ -234,8 +243,8 @@ class TableCache { const ReadOptions& ro, const FileOptions& file_options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, bool sequential_mode, - bool record_read_stats, HistogramImpl* file_read_hist, - std::unique_ptr* table_reader, + bool record_read_stats, uint8_t block_protection_bytes_per_key, + HistogramImpl* file_read_hist, std::unique_ptr* table_reader, const std::shared_ptr& prefix_extractor = nullptr, bool skip_filters = false, int level = -1, bool prefetch_index_and_filter_in_cache = true, diff --git a/db/table_cache_sync_and_async.h b/db/table_cache_sync_and_async.h index b1ab73247..df8e9337f 100644 --- a/db/table_cache_sync_and_async.h +++ b/db/table_cache_sync_and_async.h @@ -17,6 +17,7 @@ namespace ROCKSDB_NAMESPACE { DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet) (const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, const MultiGetContext::Range* mget_range, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor, HistogramImpl* file_read_hist, bool skip_filters, bool skip_range_deletions, int level, TypedHandle* handle) { @@ -65,7 +66,7 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet) if (t == nullptr) { assert(handle == nullptr); s = FindTable(options, file_options_, internal_comparator, file_meta, - &handle, prefix_extractor, + &handle, block_protection_bytes_per_key, prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */, true /* record_read_stats */, file_read_hist, skip_filters, level, true /* prefetch_index_and_filter_in_cache */, diff --git a/db/version_builder.cc b/db/version_builder.cc index 64590db5c..d87ef9449 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -1257,7 +1257,8 @@ class VersionBuilder::Rep { InternalStats* internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache, bool is_initial_load, const std::shared_ptr& prefix_extractor, - size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options) { + size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options, + uint8_t block_protection_bytes_per_key) { assert(table_cache_ != nullptr); size_t table_cache_capacity = @@ -1326,7 +1327,8 @@ class VersionBuilder::Rep { statuses[file_idx] = table_cache_->FindTable( read_options, file_options_, *(base_vstorage_->InternalComparator()), *file_meta, &handle, - prefix_extractor, false /*no_io */, true /* record_read_stats */, + block_protection_bytes_per_key, prefix_extractor, false /*no_io */, + true /* record_read_stats */, internal_stats->GetFileReadHist(level), false, level, prefetch_index_and_filter_in_cache, max_file_size_for_l0_meta_pin, file_meta->temperature); @@ -1384,11 +1386,12 @@ Status VersionBuilder::LoadTableHandlers( InternalStats* 
internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache, bool is_initial_load, const std::shared_ptr& prefix_extractor, - size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options) { - return rep_->LoadTableHandlers(internal_stats, max_threads, - prefetch_index_and_filter_in_cache, - is_initial_load, prefix_extractor, - max_file_size_for_l0_meta_pin, read_options); + size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options, + uint8_t block_protection_bytes_per_key) { + return rep_->LoadTableHandlers( + internal_stats, max_threads, prefetch_index_and_filter_in_cache, + is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin, + read_options, block_protection_bytes_per_key); } uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const { diff --git a/db/version_builder.h b/db/version_builder.h index 8e7dd9e66..fb2a304a8 100644 --- a/db/version_builder.h +++ b/db/version_builder.h @@ -48,7 +48,8 @@ class VersionBuilder { InternalStats* internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache, bool is_initial_load, const std::shared_ptr& prefix_extractor, - size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options); + size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options, + uint8_t block_protection_bytes_per_key); uint64_t GetMinOldestBlobFileNumber() const; private: diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc index 7ea176e01..d507c4b0c 100644 --- a/db/version_edit_handler.cc +++ b/db/version_edit_handler.cc @@ -566,13 +566,13 @@ Status VersionEditHandler::LoadTables(ColumnFamilyData* cfd, assert(builder_iter->second != nullptr); VersionBuilder* builder = builder_iter->second->version_builder(); assert(builder); + const MutableCFOptions* moptions = cfd->GetLatestMutableCFOptions(); Status s = builder->LoadTableHandlers( cfd->internal_stats(), version_set_->db_options_->max_file_opening_threads, prefetch_index_and_filter_in_cache, is_initial_load, - cfd->GetLatestMutableCFOptions()->prefix_extractor, - MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()), - read_options_); + moptions->prefix_extractor, MaxFileSizeForL0MetaPin(*moptions), + read_options_, moptions->block_protection_bytes_per_key); if ((s.IsPathNotFound() || s.IsCorruption()) && no_error_if_files_missing_) { s = Status::OK(); } @@ -812,16 +812,16 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( assert(builder); } + const MutableCFOptions* cf_opts_ptr = cfd->GetLatestMutableCFOptions(); auto* version = new Version(cfd, version_set_, version_set_->file_options_, - *cfd->GetLatestMutableCFOptions(), io_tracer_, + *cf_opts_ptr, io_tracer_, version_set_->current_version_number_++, epoch_number_requirement_); s = builder->LoadTableHandlers( cfd->internal_stats(), version_set_->db_options_->max_file_opening_threads, false, true, - cfd->GetLatestMutableCFOptions()->prefix_extractor, - MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()), - read_options_); + cf_opts_ptr->prefix_extractor, MaxFileSizeForL0MetaPin(*cf_opts_ptr), + read_options_, cf_opts_ptr->block_protection_bytes_per_key); if (!s.ok()) { delete version; if (s.IsCorruption()) { diff --git a/db/version_set.cc b/db/version_set.cc index 9f1888c78..cfe8d6173 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -941,7 +941,7 @@ class LevelIterator final : public InternalIterator { const std::shared_ptr& prefix_extractor, bool should_sample, HistogramImpl* file_read_hist, TableReaderCaller caller, bool skip_filters, int level, - 
RangeDelAggregator* range_del_agg, + uint8_t block_protection_bytes_per_key, RangeDelAggregator* range_del_agg, const std::vector* compaction_boundaries = nullptr, bool allow_unprepared_value = false, @@ -964,6 +964,7 @@ class LevelIterator final : public InternalIterator { pinned_iters_mgr_(nullptr), compaction_boundaries_(compaction_boundaries), is_next_read_sequential_(false), + block_protection_bytes_per_key_(block_protection_bytes_per_key), range_tombstone_iter_(nullptr), to_return_sentinel_(false) { // Empty level is not supported. @@ -1107,7 +1108,8 @@ class LevelIterator final : public InternalIterator { nullptr /* don't need reference to table */, file_read_hist_, caller_, /*arena=*/nullptr, skip_filters_, level_, /*max_file_size_for_l0_meta_pin=*/0, smallest_compaction_key, - largest_compaction_key, allow_unprepared_value_, range_tombstone_iter_); + largest_compaction_key, allow_unprepared_value_, + block_protection_bytes_per_key_, range_tombstone_iter_); } // Check if current file being fully within iterate_lower_bound. @@ -1154,6 +1156,8 @@ class LevelIterator final : public InternalIterator { bool is_next_read_sequential_; + uint8_t block_protection_bytes_per_key_; + // This is set when this level iterator is used under a merging iterator // that processes range tombstones. range_tombstone_iter_ points to where the // merging iterator stores the range tombstones iterator for this level. When @@ -1535,6 +1539,7 @@ Status Version::GetTableProperties(const ReadOptions& read_options, auto ioptions = cfd_->ioptions(); Status s = table_cache->GetTableProperties( file_options_, read_options, cfd_->internal_comparator(), *file_meta, tp, + mutable_cf_options_.block_protection_bytes_per_key, mutable_cf_options_.prefix_extractor, true /* no io */); if (s.ok()) { return s; @@ -1621,6 +1626,7 @@ Status Version::TablesRangeTombstoneSummary(int max_entries_to_print, Status s = table_cache->GetRangeTombstoneIterator( read_options, cfd_->internal_comparator(), *file_meta, + cfd_->GetLatestMutableCFOptions()->block_protection_bytes_per_key, &tombstone_iter); if (!s.ok()) { return s; @@ -1739,6 +1745,7 @@ size_t Version::GetMemoryUsageByTableReaders(const ReadOptions& read_options) { total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader( file_options_, read_options, cfd_->internal_comparator(), *file_level.files[i].file_metadata, + mutable_cf_options_.block_protection_bytes_per_key, mutable_cf_options_.prefix_extractor); } } @@ -1848,6 +1855,7 @@ InternalIterator* Version::TEST_GetLevelIterator( mutable_cf_options_.prefix_extractor, should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, + mutable_cf_options_.block_protection_bytes_per_key, nullptr /* range_del_agg */, nullptr /* compaction_boundaries */, allow_unprepared_value, &tombstone_iter_ptr); if (read_options.ignore_range_deletions) { @@ -1946,7 +1954,7 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_, /*smallest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr, allow_unprepared_value, - &tombstone_iter); + mutable_cf_options_.block_protection_bytes_per_key, &tombstone_iter); if (read_options.ignore_range_deletions) { merge_iter_builder->AddIterator(table_iter); } else { @@ -1975,8 +1983,10 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, mutable_cf_options_.prefix_extractor, should_sample_file_read(), 
cfd_->internal_stats()->GetFileReadHist(level), TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, - /*range_del_agg=*/nullptr, /*compaction_boundaries=*/nullptr, - allow_unprepared_value, &tombstone_iter_ptr); + mutable_cf_options_.block_protection_bytes_per_key, + /*range_del_agg=*/nullptr, + /*compaction_boundaries=*/nullptr, allow_unprepared_value, + &tombstone_iter_ptr); if (read_options.ignore_range_deletions) { merge_iter_builder->AddIterator(level_iter); } else { @@ -2019,7 +2029,8 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_, /*smallest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr, - /*allow_unprepared_value=*/false)); + /*allow_unprepared_value=*/false, + mutable_cf_options_.block_protection_bytes_per_key)); status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); if (!status.ok() || *overlap) { @@ -2034,7 +2045,8 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, mutable_cf_options_.prefix_extractor, should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, - &range_del_agg)); + mutable_cf_options_.block_protection_bytes_per_key, &range_del_agg, + nullptr, false)); status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); } @@ -2333,7 +2345,8 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, StopWatchNano timer(clock_, timer_enabled /* auto_start */); *status = table_cache_->Get( read_options, *internal_comparator(), *f->file_metadata, ikey, - &get_context, mutable_cf_options_.prefix_extractor, + &get_context, mutable_cf_options_.block_protection_bytes_per_key, + mutable_cf_options_.prefix_extractor, cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), IsFilterSkipped(static_cast(fp.GetHitFileLevel()), fp.IsHitFileLastInLevel()), @@ -2578,7 +2591,8 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, read_options, *internal_comparator(), *f->file_metadata, mutable_cf_options_.prefix_extractor, cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), - fp.GetHitFileLevel(), &file_range, &table_handle); + fp.GetHitFileLevel(), &file_range, &table_handle, + mutable_cf_options_.block_protection_bytes_per_key); skip_range_deletions = true; if (status.ok()) { skip_filters = true; @@ -2768,7 +2782,8 @@ Status Version::ProcessBatch( read_options, *internal_comparator(), *f->file_metadata, mutable_cf_options_.prefix_extractor, cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), - fp.GetHitFileLevel(), &file_range, &table_handle); + fp.GetHitFileLevel(), &file_range, &table_handle, + mutable_cf_options_.block_protection_bytes_per_key); if (status.ok()) { skip_filters = true; skip_range_deletions = true; @@ -5217,7 +5232,8 @@ Status VersionSet::ProcessManifestWrites( true /* prefetch_index_and_filter_in_cache */, false /* is_initial_load */, mutable_cf_options_ptrs[i]->prefix_extractor, - MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i]), read_options); + MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i]), read_options, + mutable_cf_options_ptrs[i]->block_protection_bytes_per_key); if (!s.ok()) { if (db_options_->paranoid_checks) { break; @@ -6553,10 +6569,11 @@ uint64_t VersionSet::ApproximateOffsetOf(const ReadOptions& read_options, // "key" falls in the range for this table. 
Add the // approximate offset of "key" within the table. TableCache* table_cache = v->cfd_->table_cache(); + const MutableCFOptions& cf_opts = v->GetMutableCFOptions(); if (table_cache != nullptr) { result = table_cache->ApproximateOffsetOf( read_options, key, *f.file_metadata, caller, icmp, - v->GetMutableCFOptions().prefix_extractor); + cf_opts.block_protection_bytes_per_key, cf_opts.prefix_extractor); } } return result; @@ -6596,9 +6613,10 @@ uint64_t VersionSet::ApproximateSize(const ReadOptions& read_options, if (table_cache == nullptr) { return 0; } + const MutableCFOptions& cf_opts = v->GetMutableCFOptions(); return table_cache->ApproximateSize( read_options, start, end, *f.file_metadata, caller, icmp, - v->GetMutableCFOptions().prefix_extractor); + cf_opts.block_protection_bytes_per_key, cf_opts.prefix_extractor); } void VersionSet::RemoveLiveFiles( @@ -6757,6 +6775,7 @@ InternalIterator* VersionSet::MakeInputIterator( /*smallest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr, /*allow_unprepared_value=*/false, + c->mutable_cf_options()->block_protection_bytes_per_key, /*range_del_iter=*/&range_tombstone_iter); range_tombstones.emplace_back(range_tombstone_iter, nullptr); } @@ -6770,8 +6789,9 @@ InternalIterator* VersionSet::MakeInputIterator( /*should_sample=*/false, /*no per level latency histogram=*/nullptr, TableReaderCaller::kCompaction, /*skip_filters=*/false, - /*level=*/static_cast(c->level(which)), range_del_agg, - c->boundaries(which), false, &tombstone_iter_ptr); + /*level=*/static_cast(c->level(which)), + c->mutable_cf_options()->block_protection_bytes_per_key, + range_del_agg, c->boundaries(which), false, &tombstone_iter_ptr); range_tombstones.emplace_back(nullptr, tombstone_iter_ptr); } } @@ -7008,7 +7028,8 @@ Status VersionSet::VerifyFileMetadata(const ReadOptions& read_options, TableCache::TypedHandle* handle = nullptr; FileMetaData meta_copy = meta; status = table_cache->FindTable( - read_options, file_opts, *icmp, meta_copy, &handle, pe, + read_options, file_opts, *icmp, meta_copy, &handle, + cf_opts->block_protection_bytes_per_key, pe, /*no_io=*/false, /*record_read_stats=*/true, internal_stats->GetFileReadHist(level), false, level, /*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin, diff --git a/db/version_set_sync_and_async.h b/db/version_set_sync_and_async.h index 188c2e2f9..2507762e8 100644 --- a/db/version_set_sync_and_async.h +++ b/db/version_set_sync_and_async.h @@ -25,6 +25,7 @@ DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST) StopWatchNano timer(clock_, timer_enabled /* auto_start */); s = CO_AWAIT(table_cache_->MultiGet)( read_options, *internal_comparator(), *f->file_metadata, &file_range, + mutable_cf_options_.block_protection_bytes_per_key, mutable_cf_options_.prefix_extractor, cfd_->internal_stats()->GetFileReadHist(hit_file_level), skip_filters, skip_range_deletions, hit_file_level, table_handle); diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index d16fefe4c..8756932a7 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -290,6 +290,7 @@ DECLARE_bool(paranoid_file_checks); DECLARE_bool(fail_if_options_file_error); DECLARE_uint64(batch_protection_bytes_per_key); DECLARE_uint32(memtable_protection_bytes_per_key); +DECLARE_uint32(block_protection_bytes_per_key); DECLARE_uint64(user_timestamp_size); DECLARE_string(secondary_cache_uri); diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index 
b6ee67269..9ce10f06c 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -975,6 +975,11 @@ DEFINE_uint32( "specified number of bytes per key. Currently the supported " "nonzero values are 1, 2, 4 and 8."); +DEFINE_uint32(block_protection_bytes_per_key, 0, + "If nonzero, enables integrity protection in blocks at the " + "specified number of bytes per key. Currently the supported " + "nonzero values are 1, 2, 4 and 8."); + DEFINE_string(file_checksum_impl, "none", "Name of an implementation for file_checksum_gen_factory, or " "\"none\" for null."); diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 60a12b331..710c7687b 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -3122,6 +3122,7 @@ void InitializeOptionsFromFlags( FLAGS_verify_sst_unique_id_in_manifest; options.memtable_protection_bytes_per_key = FLAGS_memtable_protection_bytes_per_key; + options.block_protection_bytes_per_key = FLAGS_block_protection_bytes_per_key; // Integrated BlobDB options.enable_blob_files = FLAGS_enable_blob_files; diff --git a/fuzz/sst_file_writer_fuzzer.cc b/fuzz/sst_file_writer_fuzzer.cc index e93b9a3f5..676daf574 100644 --- a/fuzz/sst_file_writer_fuzzer.cc +++ b/fuzz/sst_file_writer_fuzzer.cc @@ -92,7 +92,8 @@ TableReader* NewTableReader(const std::string& sst_file_path, if (s.ok()) { ImmutableOptions iopts(options, cf_ioptions); TableReaderOptions t_opt(iopts, /*prefix_extractor=*/nullptr, env_options, - cf_ioptions.internal_comparator); + cf_ioptions.internal_comparator, + 0 /* block_protection_bytes_per_key */); t_opt.largest_seqno = kMaxSequenceNumber; s = options.table_factory->NewTableReader(t_opt, std::move(file_reader), file_size, &table_reader, diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index 5862126d0..ff0a40895 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -1122,6 +1122,20 @@ struct AdvancedColumnFamilyOptions { // only compatible changes are allowed. bool persist_user_defined_timestamps = true; + // Enable/disable per key-value checksum protection for in memory blocks. + // + // Checksum is constructed when a block is loaded into memory and verification + // is done for each key read from the block. This is useful for detecting + // in-memory data corruption. Note that this feature has a non-trivial + // negative impact on read performance. Different values of the + // option have similar performance impact, but different memory cost and + // corruption detection probability (e.g. 1 byte gives 255/256 chance for + // detecting a corruption). + // + // Default: 0 (no protection) + // Supported values: 0, 1, 2, 4, 8. 
+ uint8_t block_protection_bytes_per_key = 0; + // Create ColumnFamilyOptions with default values for all fields AdvancedColumnFamilyOptions(); // Create ColumnFamilyOptions from Options diff --git a/options/cf_options.cc b/options/cf_options.cc index 2057e300a..0fccd5014 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -488,6 +488,10 @@ static std::unordered_map {offsetof(struct MutableCFOptions, memtable_protection_bytes_per_key), OptionType::kUInt32T, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"block_protection_bytes_per_key", + {offsetof(struct MutableCFOptions, block_protection_bytes_per_key), + OptionType::kUInt8T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, {kOptNameCompOpts, OptionTypeInfo::Struct( kOptNameCompOpts, &compression_options_type_info, diff --git a/options/cf_options.h b/options/cf_options.h index d5e8da734..37ef54c0c 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -172,6 +172,7 @@ struct MutableCFOptions { : options.last_level_temperature), memtable_protection_bytes_per_key( options.memtable_protection_bytes_per_key), + block_protection_bytes_per_key(options.block_protection_bytes_per_key), sample_for_compression( options.sample_for_compression), // TODO: is 0 fine here? compression_per_level(options.compression_per_level) { @@ -222,6 +223,7 @@ struct MutableCFOptions { bottommost_compression(kDisableCompressionOption), last_level_temperature(Temperature::kUnknown), memtable_protection_bytes_per_key(0), + block_protection_bytes_per_key(0), sample_for_compression(0) {} explicit MutableCFOptions(const Options& options); @@ -312,6 +314,7 @@ struct MutableCFOptions { CompressionOptions bottommost_compression_opts; Temperature last_level_temperature; uint32_t memtable_protection_bytes_per_key; + uint8_t block_protection_bytes_per_key; uint64_t sample_for_compression; std::vector compression_per_level; diff --git a/options/options_helper.cc b/options/options_helper.cc index fc651ffdb..abe5053d2 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -206,6 +206,8 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, moptions.experimental_mempurge_threshold; cf_opts->memtable_protection_bytes_per_key = moptions.memtable_protection_bytes_per_key; + cf_opts->block_protection_bytes_per_key = + moptions.block_protection_bytes_per_key; // Compaction related options cf_opts->disable_auto_compactions = moptions.disable_auto_compactions; diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index c772c786c..6357b5e9e 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -552,7 +552,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "compaction=false;age_for_warm=1;};" "blob_cache=1M;" "memtable_protection_bytes_per_key=2;" - "persist_user_defined_timestamps=true;", + "persist_user_defined_timestamps=true;" + "block_protection_bytes_per_key=1;", new_options)); ASSERT_NE(new_options->blob_cache.get(), nullptr); diff --git a/table/block_based/block.cc b/table/block_based/block.cc index b9b5d6e7e..136275b6c 100644 --- a/table/block_based/block.cc +++ b/table/block_based/block.cc @@ -30,7 +30,7 @@ namespace ROCKSDB_NAMESPACE { // Helper routine: decode the next block entry starting at "p", // storing the number of shared key bytes, non_shared key bytes, // and the length of the value in "*shared", "*non_shared", and -// "*value_length", respectively. Will not derefence past "limit". 
+// "*value_length", respectively. Will not dereference past "limit". // // If any errors are detected, returns nullptr. Otherwise, returns a // pointer to the key delta (just past the three decoded values). @@ -137,17 +137,26 @@ struct DecodeEntryV4 { return DecodeKeyV4()(p, limit, shared, non_shared); } }; + void DataBlockIter::NextImpl() { +#ifndef NDEBUG + if (TEST_Corrupt_Callback("DataBlockIter::NextImpl")) return; +#endif bool is_shared = false; ParseNextDataKey(&is_shared); + ++cur_entry_idx_; } void MetaBlockIter::NextImpl() { bool is_shared = false; ParseNextKey(&is_shared); + ++cur_entry_idx_; } -void IndexBlockIter::NextImpl() { ParseNextIndexKey(); } +void IndexBlockIter::NextImpl() { + ParseNextIndexKey(); + ++cur_entry_idx_; +} void IndexBlockIter::PrevImpl() { assert(Valid()); @@ -166,6 +175,7 @@ void IndexBlockIter::PrevImpl() { // Loop until end of current entry hits the start of original entry while (ParseNextIndexKey() && NextEntryOffset() < original) { } + --cur_entry_idx_; } void MetaBlockIter::PrevImpl() { @@ -187,6 +197,7 @@ void MetaBlockIter::PrevImpl() { while (ParseNextKey(&is_shared) && NextEntryOffset() < original) { } + --cur_entry_idx_; } // Similar to IndexBlockIter::PrevImpl but also caches the prev entries @@ -195,6 +206,7 @@ void DataBlockIter::PrevImpl() { assert(prev_entries_idx_ == -1 || static_cast(prev_entries_idx_) < prev_entries_.size()); + --cur_entry_idx_; // Check if we can use cached prev_entries_ if (prev_entries_idx_ > 0 && prev_entries_[prev_entries_idx_].offset == current_) { @@ -319,10 +331,10 @@ void MetaBlockIter::SeekImpl(const Slice& target) { // inclusive; AND // 2) the last key of this block has a greater user_key from seek_user_key // -// If the return value is TRUE, iter location has two possibilies: -// 1) If iter is valid, it is set to a location as if set by BinarySeek. In -// this case, it points to the first key with a larger user_key or a matching -// user_key with a seqno no greater than the seeking seqno. +// If the return value is TRUE, iter location has two possibilities: +// 1) If iter is valid, it is set to a location as if set by SeekImpl(target). +// In this case, it points to the first key with a larger user_key or a +// matching user_key with a seqno no greater than the seeking seqno. // 2) If the iter is invalid, it means that either all the user_key is less // than the seek_user_key, or the block ends with a matching user_key but // with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno @@ -347,11 +359,11 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { // boundary key: axy@50 (we make minimal assumption about a boundary key) // Block N+1: [axy@10, ... ] // - // If seek_key = axy@60, the search will starts from Block N. + // If seek_key = axy@60, the search will start from Block N. // Even if the user_key is not found in the hash map, the caller still // have to continue searching the next block. // - // In this case, we pretend the key is the the last restart interval. + // In this case, we pretend the key is in the last restart interval. // The while-loop below will search the last restart interval for the // key. It will stop at the first key that is larger than the seek_key, // or to the end of the block if no one is larger. 
@@ -364,12 +376,15 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { assert(restart_index < num_restarts_); SeekToRestartPoint(restart_index); current_ = GetRestartPoint(restart_index); + cur_entry_idx_ = + static_cast(restart_index * block_restart_interval_) - 1; uint32_t limit = restarts_; if (restart_index + 1 < num_restarts_) { limit = GetRestartPoint(restart_index + 1); } while (current_ < limit) { + ++cur_entry_idx_; bool shared; // Here we only linear seek the target key inside the restart interval. // If a key does not exist inside a restart interval, we avoid @@ -381,14 +396,20 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { // we stop at the first potential matching user key. break; } + // If the loop exits due to CompareCurrentKey(target) >= 0, then current key + // exists, and its checksum verification will be done in UpdateKey() called + // in SeekForGet(). + // TODO(cbi): If this loop exits with current_ == restart_, per key-value + // checksum will not be verified in UpdateKey() since Valid() + // will return false. } if (current_ == restarts_) { - // Search reaches to the end of the block. There are three possibilites: - // 1) there is only one user_key match in the block (otherwise collsion). + // Search reaches to the end of the block. There are three possibilities: + // 1) there is only one user_key match in the block (otherwise collision). // the matching user_key resides in the last restart interval, and it // is the last key of the restart interval and of the block as well. - // ParseNextKey() skiped it as its [ type | seqno ] is smaller. + // ParseNextKey() skipped it as its [ type | seqno ] is smaller. // // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry, // AND all existing user_keys in the restart interval are smaller than @@ -424,6 +445,9 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { } void IndexBlockIter::SeekImpl(const Slice& target) { +#ifndef NDEBUG + if (TEST_Corrupt_Callback("IndexBlockIter::SeekImpl")) return; +#endif TEST_SYNC_POINT("IndexBlockIter::Seek:0"); PERF_TIMER_GUARD(block_seek_nanos); if (data_ == nullptr) { // Not init yet @@ -478,7 +502,9 @@ void DataBlockIter::SeekForPrevImpl(const Slice& target) { FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); if (!Valid()) { - SeekToLastImpl(); + if (status_.ok()) { + SeekToLastImpl(); + } } else { while (Valid() && CompareCurrentKey(seek_key) > 0) { PrevImpl(); @@ -502,7 +528,9 @@ void MetaBlockIter::SeekForPrevImpl(const Slice& target) { FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); if (!Valid()) { - SeekToLastImpl(); + if (status_.ok()) { + SeekToLastImpl(); + } } else { while (Valid() && CompareCurrentKey(seek_key) > 0) { PrevImpl(); @@ -517,6 +545,7 @@ void DataBlockIter::SeekToFirstImpl() { SeekToRestartPoint(0); bool is_shared = false; ParseNextDataKey(&is_shared); + cur_entry_idx_ = 0; } void MetaBlockIter::SeekToFirstImpl() { @@ -526,15 +555,20 @@ void MetaBlockIter::SeekToFirstImpl() { SeekToRestartPoint(0); bool is_shared = false; ParseNextKey(&is_shared); + cur_entry_idx_ = 0; } void IndexBlockIter::SeekToFirstImpl() { +#ifndef NDEBUG + if (TEST_Corrupt_Callback("IndexBlockIter::SeekToFirstImpl")) return; +#endif if (data_ == nullptr) { // Not init yet return; } status_ = Status::OK(); SeekToRestartPoint(0); ParseNextIndexKey(); + cur_entry_idx_ = 0; } void DataBlockIter::SeekToLastImpl() { @@ -543,8 +577,10 @@ void DataBlockIter::SeekToLastImpl() { } SeekToRestartPoint(num_restarts_ - 1); bool is_shared = 
false;
+  cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_;
   while (ParseNextDataKey(&is_shared) && NextEntryOffset() < restarts_) {
     // Keep skipping
+    ++cur_entry_idx_;
   }
 }

@@ -554,9 +590,13 @@ void MetaBlockIter::SeekToLastImpl() {
   }
   SeekToRestartPoint(num_restarts_ - 1);
   bool is_shared = false;
+  assert(num_restarts_ >= 1);
+  cur_entry_idx_ =
+      static_cast<int32_t>((num_restarts_ - 1) * block_restart_interval_);
   while (ParseNextKey(&is_shared) && NextEntryOffset() < restarts_) {
-    // Keep skipping
+    // Will probably never reach here since restart_interval is always 1
+    ++cur_entry_idx_;
   }
 }

@@ -566,20 +606,12 @@
   }
   status_ = Status::OK();
   SeekToRestartPoint(num_restarts_ - 1);
+  cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_;
   while (ParseNextIndexKey() && NextEntryOffset() < restarts_) {
-    // Keep skipping
+    ++cur_entry_idx_;
   }
 }

-template <class TValue>
-void BlockIter<TValue>::CorruptionError() {
-  current_ = restarts_;
-  restart_index_ = num_restarts_;
-  status_ = Status::Corruption("bad entry in block");
-  raw_key_.Clear();
-  value_.clear();
-}
-
 template <class TValue>
 template <typename DecodeEntryFunc>
 bool BlockIter<TValue>::ParseNextKey(bool* is_shared) {
@@ -666,12 +698,12 @@ bool IndexBlockIter::ParseNextIndexKey() {
 // restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
 // ...
 // restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
-// where, k is key, v is value, and its encoding is in parenthesis.
+// where k is key, v is value, and its encoding is in parentheses.
 // The format of each key is (shared_size, non_shared_size, shared, non_shared)
 // The format of each value, i.e., block handle, is (offset, size) whenever the
 // is_shared is false, which included the first entry in each restart point.
-// Otherwise the format is delta-size = block handle size - size of last block
-// handle.
+// Otherwise, the format is delta-size = the size of current block - the size of
+// last block.
 void IndexBlockIter::DecodeCurrentValue(bool is_shared) {
   Slice v(value_.data(), data_ + restarts_ - value_.data());
   // Delta encoding is used if `shared` != 0.
@@ -710,6 +742,7 @@ void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
   // to follow it up with NextImpl() to position the iterator at the restart
   // key.
   SeekToRestartPoint(index);
+  cur_entry_idx_ = static_cast<int32_t>(index * block_restart_interval_) - 1;
   NextImpl();

   if (!skip_linear_scan) {
@@ -728,6 +761,8 @@ void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
     while (true) {
       NextImpl();
       if (!Valid()) {
+        // TODO(cbi): per key-value checksum will not be verified in UpdateKey()
+        //  since Valid() will return false.
         break;
       }
       if (current_ == max_offset) {
@@ -976,6 +1011,7 @@ Block::~Block() {
   // This sync point can be re-enabled if RocksDB can control the
   // initialization order of any/all static options created by the user.
   // TEST_SYNC_POINT("Block::~Block");
+  delete[] kv_checksum_;
 }

 Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
@@ -1035,6 +1071,126 @@ Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
   }
 }

+void Block::InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key,
+                                              const Comparator* raw_ucmp) {
+  protection_bytes_per_key_ = 0;
+  if (protection_bytes_per_key > 0 && num_restarts_ > 0) {
+    // NewDataIterator() is called with protection_bytes_per_key_ = 0.
+    // This is intended since the checksum is not constructed yet.
+    //
+    // We do not know global_seqno yet, so checksum computation and
+    // verification all assume global_seqno = 0.
+    std::unique_ptr<DataBlockIter> iter{NewDataIterator(
+        raw_ucmp, kDisableGlobalSequenceNumber, nullptr /* iter */,
+        nullptr /* stats */, true /* block_contents_pinned */)};
+    if (iter->status().ok()) {
+      block_restart_interval_ = iter->GetRestartInterval();
+    }
+    uint32_t num_keys = 0;
+    if (iter->status().ok()) {
+      num_keys = iter->NumberOfKeys(block_restart_interval_);
+    }
+    if (iter->status().ok()) {
+      checksum_size_ = num_keys * protection_bytes_per_key;
+      kv_checksum_ = new char[(size_t)checksum_size_];
+      size_t i = 0;
+      iter->SeekToFirst();
+      while (iter->Valid()) {
+        GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key,
+                           iter->key(), iter->value());
+        iter->Next();
+        i += protection_bytes_per_key;
+      }
+      assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
+    }
+    if (!iter->status().ok()) {
+      size_ = 0;  // Error marker
+      return;
+    }
+    protection_bytes_per_key_ = protection_bytes_per_key;
+  }
+}
+
+void Block::InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key,
+                                               const Comparator* raw_ucmp,
+                                               bool value_is_full,
+                                               bool index_has_first_key) {
+  protection_bytes_per_key_ = 0;
+  if (num_restarts_ > 0 && protection_bytes_per_key > 0) {
+    // Note that `global_seqno` and `key_includes_seq` are hardcoded here. They
+    // do not impact how the index block is parsed. During checksum
+    // construction/verification, we use the entire key buffer from
+    // raw_key_.GetKey() returned by iter->key() as the `key` part of key-value
+    // checksum, and the content of this buffer does not change for different
+    // values of `global_seqno` or `key_includes_seq`.
+    std::unique_ptr<IndexBlockIter> iter{NewIndexIterator(
+        raw_ucmp, kDisableGlobalSequenceNumber /* global_seqno */, nullptr,
+        nullptr /* Statistics */, true /* total_order_seek */,
+        index_has_first_key /* have_first_key */, false /* key_includes_seq */,
+        value_is_full, true /* block_contents_pinned */,
+        nullptr /* prefix_index */)};
+    if (iter->status().ok()) {
+      block_restart_interval_ = iter->GetRestartInterval();
+    }
+    uint32_t num_keys = 0;
+    if (iter->status().ok()) {
+      num_keys = iter->NumberOfKeys(block_restart_interval_);
+    }
+    if (iter->status().ok()) {
+      checksum_size_ = num_keys * protection_bytes_per_key;
+      kv_checksum_ = new char[(size_t)checksum_size_];
+      iter->SeekToFirst();
+      size_t i = 0;
+      while (iter->Valid()) {
+        GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key,
+                           iter->key(), iter->raw_value());
+        iter->Next();
+        i += protection_bytes_per_key;
+      }
+      assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
+    }
+    if (!iter->status().ok()) {
+      size_ = 0;  // Error marker
+      return;
+    }
+    protection_bytes_per_key_ = protection_bytes_per_key;
+  }
+}
+
+void Block::InitializeMetaIndexBlockProtectionInfo(
+    uint8_t protection_bytes_per_key) {
+  protection_bytes_per_key_ = 0;
+  if (num_restarts_ > 0 && protection_bytes_per_key > 0) {
+    std::unique_ptr<MetaBlockIter> iter{
+        NewMetaIterator(true /* block_contents_pinned */)};
+    if (iter->status().ok()) {
+      block_restart_interval_ = iter->GetRestartInterval();
+    }
+    uint32_t num_keys = 0;
+    if (iter->status().ok()) {
+      num_keys = iter->NumberOfKeys(block_restart_interval_);
+    }
+    if (iter->status().ok()) {
+      checksum_size_ = num_keys * protection_bytes_per_key;
+      kv_checksum_ = new char[(size_t)checksum_size_];
+      iter->SeekToFirst();
+      size_t i = 0;
+      while (iter->Valid()) {
+        GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key,
+                           iter->key(), iter->value());
+        iter->Next();
+        i += protection_bytes_per_key;
+      }
+      assert(!iter->status().ok()
|| i == num_keys * protection_bytes_per_key);
+    }
+    if (!iter->status().ok()) {
+      size_ = 0;  // Error marker
+      return;
+    }
+    protection_bytes_per_key_ = protection_bytes_per_key;
+  }
+}
+
 MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) {
   MetaBlockIter* iter = new MetaBlockIter();
   if (size_ < 2 * sizeof(uint32_t)) {
@@ -1045,7 +1201,8 @@ MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) {
     iter->Invalidate(Status::OK());
   } else {
     iter->Initialize(data_, restart_offset_, num_restarts_,
-                     block_contents_pinned);
+                     block_contents_pinned, protection_bytes_per_key_,
+                     kv_checksum_, block_restart_interval_);
   }
   return iter;
 }
@@ -1072,7 +1229,8 @@ DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
     ret_iter->Initialize(
         raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno,
         read_amp_bitmap_.get(), block_contents_pinned,
-        data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr);
+        data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr,
+        protection_bytes_per_key_, kv_checksum_, block_restart_interval_);
     if (read_amp_bitmap_) {
       if (read_amp_bitmap_->GetStatistics() != stats) {
         // DB changed the Statistics pointer, we need to notify read_amp_bitmap_
@@ -1108,8 +1266,9 @@ IndexBlockIter* Block::NewIndexIterator(
         total_order_seek ? nullptr : prefix_index;
     ret_iter->Initialize(raw_ucmp, data_, restart_offset_, num_restarts_,
                          global_seqno, prefix_index_ptr, have_first_key,
-                         key_includes_seq, value_is_full,
-                         block_contents_pinned);
+                         key_includes_seq, value_is_full, block_contents_pinned,
+                         protection_bytes_per_key_, kv_checksum_,
+                         block_restart_interval_);
   }

   return ret_iter;
@@ -1125,6 +1284,7 @@ size_t Block::ApproximateMemoryUsage() const {
   if (read_amp_bitmap_) {
     usage += read_amp_bitmap_->ApproximateMemoryUsage();
   }
+  usage += checksum_size_;
   return usage;
 }

diff --git a/table/block_based/block.h b/table/block_based/block.h
index dfbca8663..68b6906fa 100644
--- a/table/block_based/block.h
+++ b/table/block_based/block.h
@@ -14,6 +14,7 @@
 #include
 #include

+#include "db/kv_checksum.h"
 #include "db/pinned_iterators_manager.h"
 #include "port/malloc.h"
 #include "rocksdb/advanced_cache.h"
@@ -240,6 +241,34 @@ class Block {
   // For TypedCacheInterface
   const Slice& ContentSlice() const { return contents_.data; }

+  // Initializes per key-value checksum protection.
+  // After this method is called, each DataBlockIterator returned
+  // by NewDataIterator will verify per key-value checksum for any key it reads.
+  void InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key,
+                                         const Comparator* raw_ucmp);
+
+  // Initializes per key-value checksum protection.
+  // After this method is called, each IndexBlockIterator returned
+  // by NewIndexIterator will verify per key-value checksum for any key it reads.
+  // value_is_full and index_has_first_key are needed to be able to parse
+  // the index block content and construct checksums.
+  void InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key,
+                                          const Comparator* raw_ucmp,
+                                          bool value_is_full,
+                                          bool index_has_first_key);
+
+  // Initializes per key-value checksum protection.
+  // After this method is called, each MetaBlockIter returned
+  // by NewMetaIterator will verify per key-value checksum for any key it reads.
+ void InitializeMetaIndexBlockProtectionInfo(uint8_t protection_bytes_per_key); + + static void GenerateKVChecksum(char* checksum_ptr, uint8_t checksum_len, + const Slice& key, const Slice& value) { + ProtectionInfo64().ProtectKV(key, value).Encode(checksum_len, checksum_ptr); + } + + const char* TEST_GetKVChecksum() const { return kv_checksum_; } + private: BlockContents contents_; const char* data_; // contents_.data.data() @@ -247,6 +276,11 @@ class Block { uint32_t restart_offset_; // Offset in data_ of restart array uint32_t num_restarts_; std::unique_ptr read_amp_bitmap_; + char* kv_checksum_{nullptr}; + uint32_t checksum_size_{0}; + // Used by block iterators to calculate current key index within a block + uint32_t block_restart_interval_{0}; + uint8_t protection_bytes_per_key_{0}; DataBlockHashIndex data_block_hash_index_; }; @@ -269,6 +303,14 @@ class Block { // `Seek()` logic would be implemented by subclasses in `SeekImpl()`. These // "Impl" functions are responsible for positioning `raw_key_` but not // invoking `UpdateKey()`. +// +// Per key-value checksum is enabled if relevant states are passed in during +// `InitializeBase()`. The checksum verification is done in each call to +// UpdateKey() for the current key. Each subclass is responsible for keeping +// track of cur_entry_idx_, the index of the current key within the block. +// BlockIter uses this index to get the corresponding checksum for current key. +// Additional checksum verification may be done in subclasses if they read keys +// other than the key being processed in UpdateKey(). template class BlockIter : public InternalIteratorBase { public: @@ -286,9 +328,16 @@ class BlockIter : public InternalIteratorBase { Cleanable::Reset(); } - bool Valid() const override { return current_ < restarts_; } + bool Valid() const override { + // When status_ is not ok, iter should be invalid. + assert(status_.ok() || current_ >= restarts_); + return current_ < restarts_; + } virtual void SeekToFirst() override final { +#ifndef NDEBUG + if (TEST_Corrupt_Callback("BlockIter::SeekToFirst")) return; +#endif SeekToFirstImpl(); UpdateKey(); } @@ -325,6 +374,7 @@ class BlockIter : public InternalIteratorBase { } Status status() const override { return status_; } + Slice key() const override { assert(Valid()); return key_; @@ -337,10 +387,22 @@ class BlockIter : public InternalIteratorBase { (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); status_.PermitUncheckedError(); } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; } + PinnedIteratorsManager* pinned_iters_mgr_ = nullptr; + + bool TEST_Corrupt_Callback(const std::string& sync_point) { + bool corrupt = false; + TEST_SYNC_POINT_CALLBACK(sync_point, static_cast(&corrupt)); + + if (corrupt) { + CorruptionError(); + } + return corrupt; + } #endif bool IsKeyPinned() const override { @@ -377,27 +439,74 @@ class BlockIter : public InternalIteratorBase { Status status_; // Key to be exposed to users. Slice key_; + SequenceNumber global_seqno_; + + // Per key-value checksum related states + const char* kv_checksum_; + int32_t cur_entry_idx_; + uint32_t block_restart_interval_; + uint8_t protection_bytes_per_key_; + bool key_pinned_; // Whether the block data is guaranteed to outlive this iterator, and // as long as the cleanup functions are transferred to another class, // e.g. PinnableSlice, the pointer to the bytes will still be valid. 
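An aside on the primitive before the remaining iterator state: `GenerateKVChecksum()` above folds a `ProtectionInfo64` for a key-value pair into `checksum_len` bytes, and `UpdateKey()` below re-derives and compares it. A minimal standalone round trip, with an invented helper name purely for illustration, might look like this sketch:

```
#include "db/kv_checksum.h"
#include "rocksdb/slice.h"

namespace ROCKSDB_NAMESPACE {

// Hypothetical illustration only: mirrors Block::GenerateKVChecksum() on the
// encode side and the check done in BlockIter::UpdateKey() on the verify side.
inline bool EncodeThenVerifyKV(const Slice& key, const Slice& value) {
  constexpr uint8_t kBytesPerKey = 1;  // supported sizes: 1, 2, 4, 8
  char checksum[kBytesPerKey];
  // Encode: reduce the 64-bit protection info for (key, value) to 1 byte.
  ProtectionInfo64().ProtectKV(key, value).Encode(kBytesPerKey, checksum);
  // Verify: recompute and compare. A flipped bit in key or value fails this
  // check with probability 255/256 for a 1-byte checksum.
  return ProtectionInfo64().ProtectKV(key, value).Verify(kBytesPerKey,
                                                         checksum);
}

}  // namespace ROCKSDB_NAMESPACE
```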
 bool block_contents_pinned_;
-  SequenceNumber global_seqno_;

 virtual void SeekToFirstImpl() = 0;
 virtual void SeekToLastImpl() = 0;
 virtual void SeekImpl(const Slice& target) = 0;
 virtual void SeekForPrevImpl(const Slice& target) = 0;
 virtual void NextImpl() = 0;
-
 virtual void PrevImpl() = 0;
+
+  // Returns the restart interval of this block.
+  // Returns 0 if num_restarts_ <= 1 or if the BlockIter is not initialized.
+  virtual uint32_t GetRestartInterval() {
+    if (num_restarts_ <= 1 || data_ == nullptr) {
+      return 0;
+    }
+    SeekToFirstImpl();
+    uint32_t end_index = GetRestartPoint(1);
+    uint32_t count = 1;
+    while (NextEntryOffset() < end_index && status_.ok()) {
+      assert(Valid());
+      NextImpl();
+      ++count;
+    }
+    return count;
+  }
+
+  // Returns the number of keys in this block.
+  virtual uint32_t NumberOfKeys(uint32_t block_restart_interval) {
+    if (num_restarts_ == 0 || data_ == nullptr) {
+      return 0;
+    }
+    uint32_t count = (num_restarts_ - 1) * block_restart_interval;
+    // Add number of keys from the last restart interval
+    SeekToRestartPoint(num_restarts_ - 1);
+    while (NextEntryOffset() < restarts_ && status_.ok()) {
+      NextImpl();
+      ++count;
+    }
+    return count;
+  }
+
+  // Stores whether the current key shares bytes with the prev key in
+  // *is_shared.
+  // Sets raw_key_, value_ to the current parsed key and value.
+  // Sets restart_index_ to point to the restart interval that contains
+  // the current key.
 template <typename DecodeEntryFunc>
 inline bool ParseNextKey(bool* is_shared);

+  // protection_bytes_per_key, kv_checksum, and block_restart_interval
+  // are needed only for per kv checksum verification.
 void InitializeBase(const Comparator* raw_ucmp, const char* data,
                     uint32_t restarts, uint32_t num_restarts,
-                    SequenceNumber global_seqno, bool block_contents_pinned) {
+                    SequenceNumber global_seqno, bool block_contents_pinned,
+                    uint8_t protection_bytes_per_key, const char* kv_checksum,
+                    uint32_t block_restart_interval) {
   assert(data_ == nullptr);  // Ensure it is called only once
   assert(num_restarts > 0);  // Ensure the param is valid
@@ -410,11 +519,41 @@ class BlockIter : public InternalIteratorBase<TValue> {
   global_seqno_ = global_seqno;
   block_contents_pinned_ = block_contents_pinned;
   cache_handle_ = nullptr;
+  cur_entry_idx_ = -1;
+  protection_bytes_per_key_ = protection_bytes_per_key;
+  kv_checksum_ = kv_checksum;
+  block_restart_interval_ = block_restart_interval;
+  // Checksum related states are either all 0/nullptr or all non-zero.
+  // One exception is when num_restarts == 0, block_restart_interval can be 0
+  // since we are not able to compute it.
+  assert((protection_bytes_per_key == 0 && kv_checksum == nullptr) ||
+         (protection_bytes_per_key > 0 && kv_checksum != nullptr &&
+          (block_restart_interval > 0 || num_restarts == 1)));
+ }
+
+ void CorruptionError(const std::string& error_msg = "bad entry in block") {
+   current_ = restarts_;
+   restart_index_ = num_restarts_;
+   status_ = Status::Corruption(error_msg);
+   raw_key_.Clear();
+   value_.clear();
+ }
+
+ void PerKVChecksumCorruptionError() {
+   std::string error_msg{
+       "Corrupted block entry: per key-value checksum verification "
+       "failed."};
+   error_msg.append(" Offset: " + std::to_string(current_) + ".");
+   error_msg.append(" Entry index: " + std::to_string(cur_entry_idx_) + ".");
+   CorruptionError(error_msg);
+ }

 // Must be called every time a key is found that needs to be returned to user,
 // and may be called when no key is found (as a no-op). Updates `key_`,
 // `key_buf_`, and `key_pinned_` with info about the found key.
+ // Per key-value checksum verification is done if available for the key to be + // returned. Iterator is invalidated with corruption status if checksum + // verification fails. void UpdateKey() { key_buf_.Clear(); if (!Valid()) { @@ -433,6 +572,19 @@ class BlockIter : public InternalIteratorBase { key_ = key_buf_.GetInternalKey(); key_pinned_ = false; } + TEST_SYNC_POINT_CALLBACK("BlockIter::UpdateKey::value", + (void*)value_.data()); + TEST_SYNC_POINT_CALLBACK("Block::VerifyChecksum::checksum_len", + &protection_bytes_per_key_); + if (protection_bytes_per_key_ > 0) { + if (!ProtectionInfo64() + .ProtectKV(raw_key_.GetKey(), value_) + .Verify( + protection_bytes_per_key_, + kv_checksum_ + protection_bytes_per_key_ * cur_entry_idx_)) { + PerKVChecksumCorruptionError(); + } + } } // Returns the result of `Comparator::Compare()`, where the appropriate @@ -464,7 +616,7 @@ class BlockIter : public InternalIteratorBase { return static_cast((value_.data() + value_.size()) - data_); } - uint32_t GetRestartPoint(uint32_t index) { + uint32_t GetRestartPoint(uint32_t index) const { assert(index < num_restarts_); return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t)); } @@ -479,13 +631,20 @@ class BlockIter : public InternalIteratorBase { value_ = Slice(data_ + offset, 0); } - void CorruptionError(); - protected: template inline bool BinarySeek(const Slice& target, uint32_t* index, bool* is_index_key_result); + // Find the first key in restart interval `index` that is >= `target`. + // If there is no such key, iterator is positioned at the first key in + // restart interval `index + 1`. + // If is_index_key_result is true, it positions the iterator at the first key + // in this restart interval. + // Per key-value checksum verification is done for all keys scanned + // up to but not including the last key (the key that current_ points to + // when this function returns). This key's checksum is verified in + // UpdateKey(). void FindKeyAfterBinarySeek(const Slice& target, uint32_t index, bool is_index_key_result); }; @@ -494,22 +653,17 @@ class DataBlockIter final : public BlockIter { public: DataBlockIter() : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {} - DataBlockIter(const Comparator* raw_ucmp, const char* data, uint32_t restarts, - uint32_t num_restarts, SequenceNumber global_seqno, - BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned, - DataBlockHashIndex* data_block_hash_index) - : DataBlockIter() { - Initialize(raw_ucmp, data, restarts, num_restarts, global_seqno, - read_amp_bitmap, block_contents_pinned, data_block_hash_index); - } void Initialize(const Comparator* raw_ucmp, const char* data, uint32_t restarts, uint32_t num_restarts, SequenceNumber global_seqno, BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned, - DataBlockHashIndex* data_block_hash_index) { + DataBlockHashIndex* data_block_hash_index, + uint8_t protection_bytes_per_key, const char* kv_checksum, + uint32_t block_restart_interval) { InitializeBase(raw_ucmp, data, restarts, num_restarts, global_seqno, - block_contents_pinned); + block_contents_pinned, protection_bytes_per_key, kv_checksum, + block_restart_interval); raw_key_.SetIsUserKey(false); read_amp_bitmap_ = read_amp_bitmap; last_bitmap_offset_ = current_ + 1; @@ -527,7 +681,11 @@ class DataBlockIter final : public BlockIter { return value_; } + // Returns if `target` may exist. 
inline bool SeekForGet(const Slice& target) { +#ifndef NDEBUG + if (TEST_Corrupt_Callback("DataBlockIter::SeekForGet")) return true; +#endif if (!data_block_hash_index_) { SeekImpl(target); UpdateKey(); @@ -599,11 +757,14 @@ class MetaBlockIter final : public BlockIter { public: MetaBlockIter() : BlockIter() { raw_key_.SetIsUserKey(true); } void Initialize(const char* data, uint32_t restarts, uint32_t num_restarts, - bool block_contents_pinned) { + bool block_contents_pinned, uint8_t protection_bytes_per_key, + const char* kv_checksum, uint32_t block_restart_interval) { // Initializes the iterator with a BytewiseComparator and // the raw key being a user key. InitializeBase(BytewiseComparator(), data, restarts, num_restarts, - kDisableGlobalSequenceNumber, block_contents_pinned); + kDisableGlobalSequenceNumber, block_contents_pinned, + protection_bytes_per_key, kv_checksum, + block_restart_interval); raw_key_.SetIsUserKey(true); } @@ -613,12 +774,17 @@ class MetaBlockIter final : public BlockIter { } protected: + friend Block; void SeekToFirstImpl() override; void SeekToLastImpl() override; void SeekImpl(const Slice& target) override; void SeekForPrevImpl(const Slice& target) override; void NextImpl() override; void PrevImpl() override; + // Meta index block's restart interval is always 1. See + // MetaIndexBuilder::MetaIndexBuilder() for hard-coded restart interval. + uint32_t GetRestartInterval() override { return 1; } + uint32_t NumberOfKeys(uint32_t) override { return num_restarts_; } }; class IndexBlockIter final : public BlockIter { @@ -633,9 +799,13 @@ class IndexBlockIter final : public BlockIter { uint32_t restarts, uint32_t num_restarts, SequenceNumber global_seqno, BlockPrefixIndex* prefix_index, bool have_first_key, bool key_includes_seq, - bool value_is_full, bool block_contents_pinned) { + bool value_is_full, bool block_contents_pinned, + uint8_t protection_bytes_per_key, const char* kv_checksum, + uint32_t block_restart_interval) { InitializeBase(raw_ucmp, data, restarts, num_restarts, - kDisableGlobalSequenceNumber, block_contents_pinned); + kDisableGlobalSequenceNumber, block_contents_pinned, + protection_bytes_per_key, kv_checksum, + block_restart_interval); raw_key_.SetIsUserKey(!key_includes_seq); prefix_index_ = prefix_index; value_delta_encoded_ = !value_is_full; @@ -666,11 +836,17 @@ class IndexBlockIter final : public BlockIter { } } + Slice raw_value() const { + assert(Valid()); + return value_; + } + bool IsValuePinned() const override { return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned(); } protected: + friend Block; // IndexBlockIter follows a different contract for prefix iterator // from data iterators. 
// If prefix of the seek key `target` exists in the file, it must @@ -692,11 +868,8 @@ class IndexBlockIter final : public BlockIter { } void PrevImpl() override; - void NextImpl() override; - void SeekToFirstImpl() override; - void SeekToLastImpl() override; private: diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index 2a8e44d1c..5121d1b43 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -450,7 +450,12 @@ struct BlockBasedTableBuilder::Rep { table_options, data_block)), create_context(&table_options, ioptions.stats, compression_type == kZSTD || - compression_type == kZSTDNotFinalCompression), + compression_type == kZSTDNotFinalCompression, + tbo.moptions.block_protection_bytes_per_key, + tbo.internal_comparator.user_comparator(), + !use_delta_encoding_for_index_values, + table_opt.index_type == + BlockBasedTableOptions::kBinarySearchWithFirstKey), status_ok(true), io_status_ok(true) { if (tbo.target_file_size == 0) { diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index dc852e543..2bca07033 100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -567,7 +567,8 @@ Status BlockBasedTableFactory::NewTableReader( return BlockBasedTable::Open( ro, table_reader_options.ioptions, table_reader_options.env_options, table_options_, table_reader_options.internal_comparator, std::move(file), - file_size, table_reader, table_reader_cache_res_mgr_, + file_size, table_reader_options.block_protection_bytes_per_key, + table_reader, table_reader_cache_res_mgr_, table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache, table_reader_options.skip_filters, table_reader_options.level, table_reader_options.immortal, table_reader_options.largest_seqno, diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 0ed42348f..b51441661 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -560,6 +560,7 @@ Status BlockBasedTable::Open( const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, std::unique_ptr&& file, uint64_t file_size, + uint8_t block_protection_bytes_per_key, std::unique_ptr* table_reader, std::shared_ptr table_reader_cache_res_mgr, const std::shared_ptr& prefix_extractor, @@ -645,6 +646,7 @@ Status BlockBasedTable::Open( // meta-block reads. 
rep->compression_dict_handle = BlockHandle::NullBlockHandle(); + rep->create_context.protection_bytes_per_key = block_protection_bytes_per_key; // Read metaindex std::unique_ptr new_table( new BlockBasedTable(rep, block_cache_tracer)); @@ -671,9 +673,11 @@ Status BlockBasedTable::Open( CompressionTypeToString(kZSTD) || rep->table_properties->compression_name == CompressionTypeToString(kZSTDNotFinalCompression)); - rep->create_context = - BlockCreateContext(&rep->table_options, rep->ioptions.stats, - blocks_definitely_zstd_compressed); + rep->create_context = BlockCreateContext( + &rep->table_options, rep->ioptions.stats, + blocks_definitely_zstd_compressed, block_protection_bytes_per_key, + rep->internal_comparator.user_comparator(), rep->index_value_is_full, + rep->index_has_first_key); // Check expected unique id if provided if (expected_unique_id != kNullUniqueId64x2) { @@ -2168,6 +2172,9 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, } } s = biter.status(); + if (!s.ok()) { + break; + } } // Write the block cache access record. if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index df296a0d3..dafaa4ebf 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -98,6 +98,7 @@ class BlockBasedTable : public TableReader { const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_key_comparator, std::unique_ptr&& file, uint64_t file_size, + uint8_t block_protection_bytes_per_key, std::unique_ptr* table_reader, std::shared_ptr table_reader_cache_res_mgr = nullptr, diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc index eb1175a7d..a6ee940d8 100644 --- a/table/block_based/block_based_table_reader_test.cc +++ b/table/block_based/block_based_table_reader_test.cc @@ -116,8 +116,9 @@ class BlockBasedTableReaderBaseTest : public testing::Test { bool prefetch_index_and_filter_in_cache = true, Status* status = nullptr) { const MutableCFOptions moptions(options_); - TableReaderOptions table_reader_options = TableReaderOptions( - ioptions, moptions.prefix_extractor, EnvOptions(), comparator); + TableReaderOptions table_reader_options = + TableReaderOptions(ioptions, moptions.prefix_extractor, EnvOptions(), + comparator, 0 /* block_protection_bytes_per_key */); std::unique_ptr file; NewFileReader(table_name, foptions, &file); diff --git a/table/block_based/block_cache.cc b/table/block_based/block_cache.cc index 318d30d84..a252899d2 100644 --- a/table/block_based/block_cache.cc +++ b/table/block_based/block_cache.cc @@ -11,17 +11,25 @@ void BlockCreateContext::Create(std::unique_ptr* parsed_out, BlockContents&& block) { parsed_out->reset(new Block_kData( std::move(block), table_options->read_amp_bytes_per_bit, statistics)); + parsed_out->get()->InitializeDataBlockProtectionInfo(protection_bytes_per_key, + raw_ucmp); } void BlockCreateContext::Create(std::unique_ptr* parsed_out, BlockContents&& block) { parsed_out->reset(new Block_kIndex(std::move(block), /*read_amp_bytes_per_bit*/ 0, statistics)); + parsed_out->get()->InitializeIndexBlockProtectionInfo( + protection_bytes_per_key, raw_ucmp, index_value_is_full, + index_has_first_key); } void BlockCreateContext::Create( std::unique_ptr* parsed_out, BlockContents&& block) { parsed_out->reset(new Block_kFilterPartitionIndex( std::move(block), 
/*read_amp_bytes_per_bit*/ 0, statistics)); + parsed_out->get()->InitializeIndexBlockProtectionInfo( + protection_bytes_per_key, raw_ucmp, index_value_is_full, + index_has_first_key); } void BlockCreateContext::Create( std::unique_ptr* parsed_out, BlockContents&& block) { @@ -32,6 +40,8 @@ void BlockCreateContext::Create(std::unique_ptr* parsed_out, BlockContents&& block) { parsed_out->reset(new Block_kMetaIndex( std::move(block), /*read_amp_bytes_per_bit*/ 0, statistics)); + parsed_out->get()->InitializeMetaIndexBlockProtectionInfo( + protection_bytes_per_key); } void BlockCreateContext::Create( diff --git a/table/block_based/block_cache.h b/table/block_based/block_cache.h index ec39405fe..00eaface3 100644 --- a/table/block_based/block_cache.h +++ b/table/block_based/block_cache.h @@ -70,14 +70,26 @@ class Block_kMetaIndex : public Block { struct BlockCreateContext : public Cache::CreateContext { BlockCreateContext() {} BlockCreateContext(const BlockBasedTableOptions* _table_options, - Statistics* _statistics, bool _using_zstd) + Statistics* _statistics, bool _using_zstd, + uint8_t _protection_bytes_per_key, + const Comparator* _raw_ucmp, + bool _index_value_is_full = false, + bool _index_has_first_key = false) : table_options(_table_options), statistics(_statistics), - using_zstd(_using_zstd) {} + using_zstd(_using_zstd), + protection_bytes_per_key(_protection_bytes_per_key), + raw_ucmp(_raw_ucmp), + index_value_is_full(_index_value_is_full), + index_has_first_key(_index_has_first_key) {} const BlockBasedTableOptions* table_options = nullptr; Statistics* statistics = nullptr; bool using_zstd = false; + uint8_t protection_bytes_per_key = 0; + const Comparator* raw_ucmp = nullptr; + bool index_value_is_full; + bool index_has_first_key; // For TypedCacheInterface template diff --git a/table/block_based/block_test.cc b/table/block_based/block_test.cc index 83b87fe79..90a47ef2c 100644 --- a/table/block_based/block_test.cc +++ b/table/block_based/block_test.cc @@ -15,6 +15,7 @@ #include #include +#include "db/db_test_util.h" #include "db/dbformat.h" #include "db/memtable.h" #include "db/write_batch_internal.h" @@ -506,7 +507,7 @@ class IndexBlockTest void GenerateRandomIndexEntries(std::vector *separators, std::vector *block_handles, std::vector *first_keys, - const int len) { + const int len, bool zero_seqno = false) { Random rnd(42); // For each of `len` blocks, we need to generate a first and last key. @@ -514,7 +515,11 @@ void GenerateRandomIndexEntries(std::vector *separators, std::set keys; while ((int)keys.size() < len * 2) { // Keys need to be at least 8 bytes long to look like internal keys. 
- keys.insert(test::RandomKey(&rnd, 12)); + std::string new_key = test::RandomKey(&rnd, 12); + if (zero_seqno) { + AppendInternalKeyFooter(&new_key, 0 /* seqno */, kTypeValue); + } + keys.insert(std::move(new_key)); } uint64_t offset = 0; @@ -618,6 +623,917 @@ INSTANTIATE_TEST_CASE_P(P, IndexBlockTest, std::make_tuple(true, false), std::make_tuple(true, true))); +class BlockPerKVChecksumTest : public DBTestBase { + public: + BlockPerKVChecksumTest() + : DBTestBase("block_per_kv_checksum", /*env_do_fsync=*/false) {} + + template + void TestIterateForward(std::unique_ptr &biter, + size_t &verification_count) { + while (biter->Valid()) { + verification_count = 0; + biter->Next(); + if (biter->Valid()) { + ASSERT_GE(verification_count, 1); + } + } + } + + template + void TestIterateBackward(std::unique_ptr &biter, + size_t &verification_count) { + while (biter->Valid()) { + verification_count = 0; + biter->Prev(); + if (biter->Valid()) { + ASSERT_GE(verification_count, 1); + } + } + } + + template + void TestSeekToFirst(std::unique_ptr &biter, + size_t &verification_count) { + verification_count = 0; + biter->SeekToFirst(); + ASSERT_GE(verification_count, 1); + TestIterateForward(biter, verification_count); + } + + template + void TestSeekToLast(std::unique_ptr &biter, + size_t &verification_count) { + verification_count = 0; + biter->SeekToLast(); + ASSERT_GE(verification_count, 1); + TestIterateBackward(biter, verification_count); + } + + template + void TestSeekForPrev(std::unique_ptr &biter, + size_t &verification_count, std::string k) { + verification_count = 0; + biter->SeekForPrev(k); + ASSERT_GE(verification_count, 1); + TestIterateBackward(biter, verification_count); + } + + template + void TestSeek(std::unique_ptr &biter, size_t &verification_count, + std::string k) { + verification_count = 0; + biter->Seek(k); + ASSERT_GE(verification_count, 1); + TestIterateForward(biter, verification_count); + } + + bool VerifyChecksum(uint32_t checksum_len, const char *checksum_ptr, + const Slice &key, const Slice &val) { + if (!checksum_len) { + return checksum_ptr == nullptr; + } + return ProtectionInfo64().ProtectKV(key, val).Verify( + static_cast(checksum_len), checksum_ptr); + } +}; + +TEST_F(BlockPerKVChecksumTest, EmptyBlock) { + // Tests that empty block code path is not broken by per kv checksum. 
+ BlockBuilder builder( + 16 /* block_restart_interval */, true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch); + Slice raw_block = builder.Finish(); + BlockContents contents; + contents.data = raw_block; + + std::unique_ptr data_block; + Options options = Options(); + BlockBasedTableOptions tbo; + uint8_t protection_bytes_per_key = 8; + BlockCreateContext create_context{ + &tbo, nullptr /* statistics */, false /* using_zstd */, + protection_bytes_per_key, options.comparator}; + create_context.Create(&data_block, std::move(contents)); + std::unique_ptr biter{data_block->NewDataIterator( + options.comparator, kDisableGlobalSequenceNumber)}; + biter->SeekToFirst(); + ASSERT_FALSE(biter->Valid()); + ASSERT_OK(biter->status()); + Random rnd(33); + biter->SeekForGet(GenerateInternalKey(1, 1, 10, &rnd)); + ASSERT_FALSE(biter->Valid()); + ASSERT_OK(biter->status()); + biter->SeekToLast(); + ASSERT_FALSE(biter->Valid()); + ASSERT_OK(biter->status()); + biter->Seek(GenerateInternalKey(1, 1, 10, &rnd)); + ASSERT_FALSE(biter->Valid()); + ASSERT_OK(biter->status()); + biter->SeekForPrev(GenerateInternalKey(1, 1, 10, &rnd)); + ASSERT_FALSE(biter->Valid()); + ASSERT_OK(biter->status()); +} + +TEST_F(BlockPerKVChecksumTest, UnsupportedOptionValue) { + Options options = Options(); + options.block_protection_bytes_per_key = 128; + Destroy(options); + ASSERT_TRUE(TryReopen(options).IsNotSupported()); +} + +TEST_F(BlockPerKVChecksumTest, InitializeProtectionInfo) { + // Make sure that the checksum construction code path does not break + // when the block is itself already corrupted. + Options options = Options(); + BlockBasedTableOptions tbo; + uint8_t protection_bytes_per_key = 8; + BlockCreateContext create_context{ + &tbo, nullptr /* statistics */, false /* using_zstd */, + protection_bytes_per_key, options.comparator}; + + { + std::string invalid_content = "1"; + Slice raw_block = invalid_content; + BlockContents contents; + contents.data = raw_block; + std::unique_ptr data_block; + create_context.Create(&data_block, std::move(contents)); + std::unique_ptr iter{data_block->NewDataIterator( + options.comparator, kDisableGlobalSequenceNumber)}; + ASSERT_TRUE(iter->status().IsCorruption()); + } + { + std::string invalid_content = "1"; + Slice raw_block = invalid_content; + BlockContents contents; + contents.data = raw_block; + std::unique_ptr index_block; + create_context.Create(&index_block, std::move(contents)); + std::unique_ptr iter{index_block->NewIndexIterator( + options.comparator, kDisableGlobalSequenceNumber, nullptr, nullptr, + true, false, true, true)}; + ASSERT_TRUE(iter->status().IsCorruption()); + } + { + std::string invalid_content = "1"; + Slice raw_block = invalid_content; + BlockContents contents; + contents.data = raw_block; + std::unique_ptr meta_block; + create_context.Create(&meta_block, std::move(contents)); + std::unique_ptr iter{meta_block->NewMetaIterator(true)}; + ASSERT_TRUE(iter->status().IsCorruption()); + } +} + +TEST_F(BlockPerKVChecksumTest, ApproximateMemory) { + // Tests that ApproximateMemoryUsage() includes memory used by block kv + // checksum. 
+ const int kNumRecords = 20; + std::vector keys; + std::vector values; + GenerateRandomKVs(&keys, &values, 0, kNumRecords, 1 /* step */, + 24 /* padding_size */); + std::unique_ptr builder; + auto generate_block_content = [&]() { + builder = std::make_unique(16 /* restart_interval */); + for (int i = 0; i < kNumRecords; ++i) { + builder->Add(keys[i], values[i]); + } + Slice raw_block = builder->Finish(); + BlockContents contents; + contents.data = raw_block; + return contents; + }; + + Options options = Options(); + BlockBasedTableOptions tbo; + uint8_t protection_bytes_per_key = 8; + BlockCreateContext with_checksum_create_context{ + &tbo, + nullptr /* statistics */, + false /* using_zstd */, + protection_bytes_per_key, + options.comparator, + true /* index_value_is_full */}; + BlockCreateContext create_context{ + &tbo, nullptr /* statistics */, false /* using_zstd */, + 0, options.comparator, true /* index_value_is_full */}; + + { + std::unique_ptr data_block; + create_context.Create(&data_block, generate_block_content()); + size_t block_memory = data_block->ApproximateMemoryUsage(); + std::unique_ptr with_checksum_data_block; + with_checksum_create_context.Create(&with_checksum_data_block, + generate_block_content()); + ASSERT_GT(with_checksum_data_block->ApproximateMemoryUsage() - block_memory, + 100); + } + + { + std::unique_ptr meta_block; + create_context.Create(&meta_block, generate_block_content()); + size_t block_memory = meta_block->ApproximateMemoryUsage(); + std::unique_ptr with_checksum_meta_block; + with_checksum_create_context.Create(&with_checksum_meta_block, + generate_block_content()); + // Rough comparison to avoid flaky test due to memory allocation alignment. + ASSERT_GT(with_checksum_meta_block->ApproximateMemoryUsage() - block_memory, + 100); + } + + { + // Index block has different contents. + std::vector separators; + std::vector block_handles; + std::vector first_keys; + GenerateRandomIndexEntries(&separators, &block_handles, &first_keys, + kNumRecords); + auto generate_index_content = [&]() { + builder = std::make_unique(16 /* restart_interval */); + BlockHandle last_encoded_handle; + for (int i = 0; i < kNumRecords; ++i) { + IndexValue entry(block_handles[i], first_keys[i]); + std::string encoded_entry; + std::string delta_encoded_entry; + entry.EncodeTo(&encoded_entry, false, nullptr); + last_encoded_handle = entry.handle; + const Slice delta_encoded_entry_slice(delta_encoded_entry); + builder->Add(separators[i], encoded_entry, &delta_encoded_entry_slice); + } + Slice raw_block = builder->Finish(); + BlockContents contents; + contents.data = raw_block; + return contents; + }; + + std::unique_ptr index_block; + create_context.Create(&index_block, generate_index_content()); + size_t block_memory = index_block->ApproximateMemoryUsage(); + std::unique_ptr with_checksum_index_block; + with_checksum_create_context.Create(&with_checksum_index_block, + generate_index_content()); + ASSERT_GT( + with_checksum_index_block->ApproximateMemoryUsage() - block_memory, + 100); + } +} + +std::string GetDataBlockIndexTypeStr( + BlockBasedTableOptions::DataBlockIndexType t) { + return t == BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch + ? 
"BinarySearch" + : "BinaryAndHash"; +} + +class DataBlockKVChecksumTest + : public BlockPerKVChecksumTest, + public testing::WithParamInterface> { + public: + DataBlockKVChecksumTest() = default; + + BlockBasedTableOptions::DataBlockIndexType GetDataBlockIndexType() const { + return std::get<0>(GetParam()); + } + uint8_t GetChecksumLen() const { return std::get<1>(GetParam()); } + uint32_t GetRestartInterval() const { return std::get<2>(GetParam()); } + bool GetUseDeltaEncoding() const { return std::get<3>(GetParam()); } + + std::unique_ptr GenerateDataBlock( + std::vector &keys, std::vector &values, + int num_record) { + BlockBasedTableOptions tbo; + BlockCreateContext create_context{&tbo, nullptr /* statistics */, + false /* using_zstd */, GetChecksumLen(), + Options().comparator}; + builder_ = std::make_unique( + static_cast(GetRestartInterval()), + GetUseDeltaEncoding() /* use_delta_encoding */, + false /* use_value_delta_encoding */, GetDataBlockIndexType()); + for (int i = 0; i < num_record; i++) { + builder_->Add(keys[i], values[i]); + } + Slice raw_block = builder_->Finish(); + BlockContents contents; + contents.data = raw_block; + std::unique_ptr data_block; + create_context.Create(&data_block, std::move(contents)); + return data_block; + } + + std::unique_ptr builder_; +}; + +INSTANTIATE_TEST_CASE_P( + P, DataBlockKVChecksumTest, + ::testing::Combine( + ::testing::Values( + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch, + BlockBasedTableOptions::DataBlockIndexType:: + kDataBlockBinaryAndHash), + ::testing::Values(0, 1, 2, 4, 8) /* protection_bytes_per_key */, + ::testing::Values(1, 2, 3, 8, 16) /* restart_interval */, + ::testing::Values(false, true)) /* delta_encoding */, + [](const testing::TestParamInfo> + &args) { + std::ostringstream oss; + oss << GetDataBlockIndexTypeStr(std::get<0>(args.param)) + << "ProtectionPerKey" << std::to_string(std::get<1>(args.param)) + << "RestartInterval" << std::to_string(std::get<2>(args.param)) + << "DeltaEncode" << std::to_string(std::get<3>(args.param)); + return oss.str(); + }); + +TEST_P(DataBlockKVChecksumTest, ChecksumConstructionAndVerification) { + uint8_t protection_bytes_per_key = GetChecksumLen(); + std::vector num_restart_intervals = {1, 16}; + for (const auto num_restart_interval : num_restart_intervals) { + const int kNumRecords = + num_restart_interval * static_cast(GetRestartInterval()); + std::vector keys; + std::vector values; + GenerateRandomKVs(&keys, &values, 0, kNumRecords + 1, 1 /* step */, + 24 /* padding_size */); + SyncPoint::GetInstance()->DisableProcessing(); + std::unique_ptr data_block = + GenerateDataBlock(keys, values, kNumRecords); + + const char *checksum_ptr = data_block->TEST_GetKVChecksum(); + // Check checksum of correct length is generated + for (int i = 0; i < kNumRecords; i++) { + ASSERT_TRUE(VerifyChecksum(protection_bytes_per_key, + checksum_ptr + i * protection_bytes_per_key, + keys[i], values[i])); + } + std::vector seqnos{kDisableGlobalSequenceNumber, 0}; + + // Could just use a boolean flag. Use a counter here just to keep open the + // possibility of checking the exact number of verifications in the future. + size_t verification_count = 0; + // The SyncPoint is placed before checking checksum_len == 0 in + // Block::VerifyChecksum(). So verification count is incremented even with + // protection_bytes_per_key = 0. No actual checksum computation is done in + // that case (see Block::VerifyChecksum()). 
+ SyncPoint::GetInstance()->SetCallBack( + "Block::VerifyChecksum::checksum_len", + [&verification_count, protection_bytes_per_key](void *checksum_len) { + ASSERT_EQ((*static_cast(checksum_len)), + protection_bytes_per_key); + ++verification_count; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + for (const auto seqno : seqnos) { + std::unique_ptr biter{ + data_block->NewDataIterator(Options().comparator, seqno)}; + + // SeekForGet() some key that does not exist + biter->SeekForGet(keys[kNumRecords]); + TestIterateForward(biter, verification_count); + + verification_count = 0; + biter->SeekForGet(keys[kNumRecords / 2]); + ASSERT_GE(verification_count, 1); + TestIterateForward(biter, verification_count); + + TestSeekToFirst(biter, verification_count); + TestSeekToLast(biter, verification_count); + TestSeekForPrev(biter, verification_count, keys[kNumRecords / 2]); + TestSeek(biter, verification_count, keys[kNumRecords / 2]); + } + } +} + +class IndexBlockKVChecksumTest + : public BlockPerKVChecksumTest, + public testing::WithParamInterface< + std::tuple> { + public: + IndexBlockKVChecksumTest() = default; + + BlockBasedTableOptions::DataBlockIndexType GetDataBlockIndexType() const { + return std::get<0>(GetParam()); + } + uint8_t GetChecksumLen() const { return std::get<1>(GetParam()); } + uint32_t GetRestartInterval() const { return std::get<2>(GetParam()); } + bool UseValueDeltaEncoding() const { return std::get<3>(GetParam()); } + bool IncludeFirstKey() const { return std::get<4>(GetParam()); } + + std::unique_ptr GenerateIndexBlock( + std::vector &separators, + std::vector &block_handles, + std::vector &first_keys, int num_record) { + Options options = Options(); + BlockBasedTableOptions tbo; + uint8_t protection_bytes_per_key = GetChecksumLen(); + BlockCreateContext create_context{ + &tbo, + nullptr /* statistics */, + false /* _using_zstd */, + protection_bytes_per_key, + options.comparator, + !UseValueDeltaEncoding() /* value_is_full */, + IncludeFirstKey()}; + builder_ = std::make_unique( + static_cast(GetRestartInterval()), true /* use_delta_encoding */, + UseValueDeltaEncoding() /* use_value_delta_encoding */, + GetDataBlockIndexType()); + BlockHandle last_encoded_handle; + for (int i = 0; i < num_record; i++) { + IndexValue entry(block_handles[i], first_keys[i]); + std::string encoded_entry; + std::string delta_encoded_entry; + entry.EncodeTo(&encoded_entry, IncludeFirstKey(), nullptr); + if (UseValueDeltaEncoding() && i > 0) { + entry.EncodeTo(&delta_encoded_entry, IncludeFirstKey(), + &last_encoded_handle); + } + + last_encoded_handle = entry.handle; + const Slice delta_encoded_entry_slice(delta_encoded_entry); + builder_->Add(separators[i], encoded_entry, &delta_encoded_entry_slice); + } + // read serialized contents of the block + Slice raw_block = builder_->Finish(); + // create block reader + BlockContents contents; + contents.data = raw_block; + std::unique_ptr index_block; + + create_context.Create(&index_block, std::move(contents)); + return index_block; + } + + std::unique_ptr builder_; +}; + +INSTANTIATE_TEST_CASE_P( + P, IndexBlockKVChecksumTest, + ::testing::Combine( + ::testing::Values( + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch, + BlockBasedTableOptions::DataBlockIndexType:: + kDataBlockBinaryAndHash), + ::testing::Values(0, 1, 2, 4, 8), ::testing::Values(1, 3, 8, 16), + ::testing::Values(true, false), ::testing::Values(true, false)), + [](const testing::TestParamInfo< + std::tuple> &args) { + std::ostringstream oss; + oss << 
GetDataBlockIndexTypeStr(std::get<0>(args.param)) << "ProtBytes" + << std::to_string(std::get<1>(args.param)) << "RestartInterval" + << std::to_string(std::get<2>(args.param)) << "ValueDeltaEncode" + << std::to_string(std::get<3>(args.param)) << "IncludeFirstKey" + << std::to_string(std::get<4>(args.param)); + return oss.str(); + }); + +TEST_P(IndexBlockKVChecksumTest, ChecksumConstructionAndVerification) { + Options options = Options(); + uint8_t protection_bytes_per_key = GetChecksumLen(); + std::vector num_restart_intervals = {1, 16}; + std::vector seqnos{kDisableGlobalSequenceNumber, 10001}; + + for (const auto num_restart_interval : num_restart_intervals) { + const int kNumRecords = + num_restart_interval * static_cast(GetRestartInterval()); + for (const auto seqno : seqnos) { + std::vector separators; + std::vector block_handles; + std::vector first_keys; + GenerateRandomIndexEntries(&separators, &block_handles, &first_keys, + kNumRecords, + seqno != kDisableGlobalSequenceNumber); + SyncPoint::GetInstance()->DisableProcessing(); + std::unique_ptr index_block = GenerateIndexBlock( + separators, block_handles, first_keys, kNumRecords); + IndexBlockIter *kNullIter = nullptr; + Statistics *kNullStats = nullptr; + // read contents of block sequentially + std::unique_ptr biter{index_block->NewIndexIterator( + options.comparator, seqno, kNullIter, kNullStats, + true /* total_order_seek */, IncludeFirstKey() /* have_first_key */, + true /* key_includes_seq */, + !UseValueDeltaEncoding() /* value_is_full */, + true /* block_contents_pinned */, nullptr /* prefix_index */)}; + biter->SeekToFirst(); + const char *checksum_ptr = index_block->TEST_GetKVChecksum(); + // Check checksum of correct length is generated + for (int i = 0; i < kNumRecords; i++) { + // Obtaining the actual content written as value to index block is not + // trivial: delta-encoded value is only persisted when not at block + // restart point and that keys share some byte (see more in + // BlockBuilder::AddWithLastKeyImpl()). So here we just do verification + // using value from iterator unlike tests for DataBlockIter or + // MetaBlockIter. + ASSERT_TRUE(VerifyChecksum(protection_bytes_per_key, checksum_ptr, + biter->key(), biter->raw_value())); + } + + size_t verification_count = 0; + // The SyncPoint is placed before checking checksum_len == 0 in + // Block::VerifyChecksum(). To make the testing code below simpler and not + // having to differentiate 0 vs non-0 checksum_len, we do an explicit + // assert checking on checksum_len here. 
+
+class MetaIndexBlockKVChecksumTest
+    : public BlockPerKVChecksumTest,
+      public testing::WithParamInterface<
+          uint8_t /* block_protection_bytes_per_key */> {
+ public:
+  MetaIndexBlockKVChecksumTest() = default;
+  uint8_t GetChecksumLen() const { return GetParam(); }
+  uint32_t GetRestartInterval() const { return 1; }
+
+  std::unique_ptr<Block_kMetaIndex> GenerateMetaIndexBlock(
+      std::vector<std::string> &keys, std::vector<std::string> &values,
+      int num_record) {
+    Options options = Options();
+    BlockBasedTableOptions tbo;
+    uint8_t protection_bytes_per_key = GetChecksumLen();
+    BlockCreateContext create_context{
+        &tbo, nullptr /* statistics */, false /* using_zstd */,
+        protection_bytes_per_key, options.comparator};
+    builder_ =
+        std::make_unique<BlockBuilder>(static_cast<int>(GetRestartInterval()));
+    // add a bunch of records to a block
+    for (int i = 0; i < num_record; i++) {
+      builder_->Add(keys[i], values[i]);
+    }
+    Slice raw_block = builder_->Finish();
+    BlockContents contents;
+    contents.data = raw_block;
+    std::unique_ptr<Block_kMetaIndex> meta_block;
+    create_context.Create(&meta_block, std::move(contents));
+    return meta_block;
+  }
+
+  std::unique_ptr<BlockBuilder> builder_;
+};
+
+INSTANTIATE_TEST_CASE_P(P, MetaIndexBlockKVChecksumTest,
+                        ::testing::Values(0, 1, 2, 4, 8),
+                        [](const testing::TestParamInfo<uint8_t> &args) {
+                          std::ostringstream oss;
+                          oss << "ProtBytes" << std::to_string(args.param);
+                          return oss.str();
+                        });
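
Everything these tests exercise is gated by the new column-family option. A typical opt-in from application code looks like the following; 0 (the default) disables the feature, and 1, 2, 4, and 8 are the supported checksum sizes, as in the parameterizations above:

```
#include "rocksdb/options.h"

ROCKSDB_NAMESPACE::Options options;
// Protect each key-value pair in cached blocks with an 8-byte checksum,
// trading extra memory and some read CPU for integrity coverage.
options.block_protection_bytes_per_key = 8;
```
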
+
+TEST_P(MetaIndexBlockKVChecksumTest, ChecksumConstructionAndVerification) {
+  Options options = Options();
+  BlockBasedTableOptions tbo;
+  uint8_t protection_bytes_per_key = GetChecksumLen();
+  BlockCreateContext create_context{
+      &tbo, nullptr /* statistics */, false /* using_zstd */,
+      protection_bytes_per_key, options.comparator};
+  std::vector<int> num_restart_intervals = {1, 16};
+  for (const auto num_restart_interval : num_restart_intervals) {
+    const int kNumRecords = num_restart_interval * GetRestartInterval();
+    std::vector<std::string> keys;
+    std::vector<std::string> values;
+    GenerateRandomKVs(&keys, &values, 0, kNumRecords + 1, 1 /* step */,
+                      24 /* padding_size */);
+    SyncPoint::GetInstance()->DisableProcessing();
+    std::unique_ptr<Block_kMetaIndex> meta_block =
+        GenerateMetaIndexBlock(keys, values, kNumRecords);
+    const char *checksum_ptr = meta_block->TEST_GetKVChecksum();
+    // Check that a checksum of the correct length is generated
+    for (int i = 0; i < kNumRecords; i++) {
+      ASSERT_TRUE(VerifyChecksum(protection_bytes_per_key,
+                                 checksum_ptr + i * protection_bytes_per_key,
+                                 keys[i], values[i]));
+    }
+
+    size_t verification_count = 0;
+    // The SyncPoint is placed before checking checksum_len == 0 in
+    // Block::VerifyChecksum(). To keep the testing code below simple and
+    // avoid differentiating 0 vs non-0 checksum_len, we assert on
+    // checksum_len explicitly here.
+    SyncPoint::GetInstance()->SetCallBack(
+        "Block::VerifyChecksum::checksum_len",
+        [&verification_count, protection_bytes_per_key](void *checksum_len) {
+          ASSERT_EQ((*static_cast<uint8_t *>(checksum_len)),
+                    protection_bytes_per_key);
+          ++verification_count;
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    // Check that the block iterator does checksum verification
+    std::unique_ptr<MetaBlockIter> biter{
+        meta_block->NewMetaIterator(true /* block_contents_pinned */)};
+    TestSeekToFirst(biter, verification_count);
+    TestSeekToLast(biter, verification_count);
+    TestSeek(biter, verification_count, keys[kNumRecords / 2]);
+    TestSeekForPrev(biter, verification_count, keys[kNumRecords / 2]);
+  }
+}
+
+class DataBlockKVChecksumCorruptionTest : public DataBlockKVChecksumTest {
+ public:
+  DataBlockKVChecksumCorruptionTest() = default;
+
+  std::unique_ptr<DataBlockIter> GenerateDataBlockIter(
+      std::vector<std::string> &keys, std::vector<std::string> &values,
+      int num_record) {
+    // During Block construction, a block iter may be created to initialize
+    // per kv checksums. Disable syncpoints that may be triggered by block
+    // iter methods during that initialization.
+    SyncPoint::GetInstance()->DisableProcessing();
+    block_ = GenerateDataBlock(keys, values, num_record);
+    std::unique_ptr<DataBlockIter> biter{block_->NewDataIterator(
+        Options().comparator, kDisableGlobalSequenceNumber)};
+    SyncPoint::GetInstance()->EnableProcessing();
+    return biter;
+  }
+
+ protected:
+  std::unique_ptr<Block_kData> block_;
+};
+
+TEST_P(DataBlockKVChecksumCorruptionTest, CorruptEntry) {
+  std::vector<int> num_restart_intervals = {1, 3};
+  for (const auto num_restart_interval : num_restart_intervals) {
+    const int kNumRecords =
+        num_restart_interval * static_cast<int>(GetRestartInterval());
+    std::vector<std::string> keys;
+    std::vector<std::string> values;
+    GenerateRandomKVs(&keys, &values, 0, kNumRecords + 1, 1 /* step */,
+                      24 /* padding_size */);
+    SyncPoint::GetInstance()->SetCallBack(
+        "BlockIter::UpdateKey::value", [](void *arg) {
+          char *value = static_cast<char *>(arg);
+          // values generated by GenerateRandomKVs are of length 100
+          ++value[10];
+        });
+
+    // Purely for reducing the number of lines of code.
+    typedef std::unique_ptr<DataBlockIter> IterPtr;
+    typedef void(IterAPI)(IterPtr & iter, std::string &);
+
+    std::string seek_key = keys[kNumRecords / 2];
+    auto test_seek = [&](IterAPI iter_api) {
+      IterPtr biter = GenerateDataBlockIter(keys, values, kNumRecords);
+      ASSERT_OK(biter->status());
+      iter_api(biter, seek_key);
+      ASSERT_FALSE(biter->Valid());
+      ASSERT_TRUE(biter->status().IsCorruption());
+    };
+
+    test_seek([](IterPtr &iter, std::string &) { iter->SeekToFirst(); });
+    test_seek([](IterPtr &iter, std::string &) { iter->SeekToLast(); });
+    test_seek([](IterPtr &iter, std::string &k) { iter->Seek(k); });
+    test_seek([](IterPtr &iter, std::string &k) { iter->SeekForPrev(k); });
+    test_seek([](IterPtr &iter, std::string &k) { iter->SeekForGet(k); });
+
+    typedef void (DataBlockIter::*IterStepAPI)();
+    auto test_step = [&](IterStepAPI iter_api, std::string &k) {
+      IterPtr biter = GenerateDataBlockIter(keys, values, kNumRecords);
+      SyncPoint::GetInstance()->DisableProcessing();
+      biter->Seek(k);
+      ASSERT_TRUE(biter->Valid());
+      ASSERT_OK(biter->status());
+      SyncPoint::GetInstance()->EnableProcessing();
+      std::invoke(iter_api, biter);
+      ASSERT_FALSE(biter->Valid());
+      ASSERT_TRUE(biter->status().IsCorruption());
+    };
+
+    if (kNumRecords > 1) {
+      test_step(&DataBlockIter::Prev, seek_key);
+      test_step(&DataBlockIter::Next, seek_key);
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    P, DataBlockKVChecksumCorruptionTest,
+    ::testing::Combine(
+        ::testing::Values(
+            BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch,
+            BlockBasedTableOptions::DataBlockIndexType::
+                kDataBlockBinaryAndHash),
+        ::testing::Values(4, 8) /* block_protection_bytes_per_key */,
+        ::testing::Values(1, 3, 8, 16) /* restart_interval */,
+        ::testing::Values(false, true)),
+    [](const testing::TestParamInfo<
+        std::tuple<BlockBasedTableOptions::DataBlockIndexType, uint8_t,
+                   uint32_t, bool>> &args) {
+      std::ostringstream oss;
+      oss << GetDataBlockIndexTypeStr(std::get<0>(args.param)) << "ProtBytes"
+          << std::to_string(std::get<1>(args.param)) << "RestartInterval"
+          << std::to_string(std::get<2>(args.param)) << "DeltaEncode"
+          << std::to_string(std::get<3>(args.param));
+      return oss.str();
+    });
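
The corruption tests flip one byte in the buffer exposed at `BlockIter::UpdateKey::value` and then assert that every seek and step API leaves the iterator invalid with a corruption status. That is the same contract a user of the feature relies on at the read path: a checksum mismatch should surface as a status, never as silently wrong data. A hedged sketch of what that looks like against an open DB (assuming `db` was opened with `block_protection_bytes_per_key` > 0):

```
#include <string>

#include "rocksdb/db.h"

void ReadWithProtection(ROCKSDB_NAMESPACE::DB *db) {
  std::string value;
  ROCKSDB_NAMESPACE::Status s =
      db->Get(ROCKSDB_NAMESPACE::ReadOptions(), "some_key", &value);
  if (s.IsCorruption()) {
    // The per key-value checksum caught an in-memory corruption; the read
    // did not return bad data.
  }
}
```
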
+
+class IndexBlockKVChecksumCorruptionTest : public IndexBlockKVChecksumTest {
+ public:
+  IndexBlockKVChecksumCorruptionTest() = default;
+
+  std::unique_ptr<IndexBlockIter> GenerateIndexBlockIter(
+      std::vector<std::string> &separators,
+      std::vector<BlockHandle> &block_handles,
+      std::vector<std::string> &first_keys, int num_record,
+      SequenceNumber seqno) {
+    SyncPoint::GetInstance()->DisableProcessing();
+    block_ =
+        GenerateIndexBlock(separators, block_handles, first_keys, num_record);
+    std::unique_ptr<IndexBlockIter> biter{block_->NewIndexIterator(
+        Options().comparator, seqno, nullptr, nullptr,
+        true /* total_order_seek */, IncludeFirstKey() /* have_first_key */,
+        true /* key_includes_seq */,
+        !UseValueDeltaEncoding() /* value_is_full */,
+        true /* block_contents_pinned */, nullptr /* prefix_index */)};
+    SyncPoint::GetInstance()->EnableProcessing();
+    return biter;
+  }
+
+ protected:
+  std::unique_ptr<Block_kIndex> block_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+    P, IndexBlockKVChecksumCorruptionTest,
+    ::testing::Combine(
+        ::testing::Values(
+            BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch,
+            BlockBasedTableOptions::DataBlockIndexType::
+                kDataBlockBinaryAndHash),
+        ::testing::Values(4, 8) /* block_protection_bytes_per_key */,
+        ::testing::Values(1, 3, 8, 16) /* restart_interval */,
+        ::testing::Values(true, false), ::testing::Values(true, false)),
+    [](const testing::TestParamInfo<
+        std::tuple<BlockBasedTableOptions::DataBlockIndexType, uint8_t,
+                   uint32_t, bool, bool>> &args) {
+      std::ostringstream oss;
+      oss << GetDataBlockIndexTypeStr(std::get<0>(args.param)) << "ProtBytes"
+          << std::to_string(std::get<1>(args.param)) << "RestartInterval"
+          << std::to_string(std::get<2>(args.param)) << "ValueDeltaEncode"
+          << std::to_string(std::get<3>(args.param)) << "IncludeFirstKey"
+          << std::to_string(std::get<4>(args.param));
+      return oss.str();
+    });
+
+TEST_P(IndexBlockKVChecksumCorruptionTest, CorruptEntry) {
+  std::vector<int> num_restart_intervals = {1, 3};
+  std::vector<SequenceNumber> seqnos{kDisableGlobalSequenceNumber, 10001};
+
+  for (const auto num_restart_interval : num_restart_intervals) {
+    const int kNumRecords =
+        num_restart_interval * static_cast<int>(GetRestartInterval());
+    for (const auto seqno : seqnos) {
+      std::vector<std::string> separators;
+      std::vector<BlockHandle> block_handles;
+      std::vector<std::string> first_keys;
+      GenerateRandomIndexEntries(&separators, &block_handles, &first_keys,
+                                 kNumRecords,
+                                 seqno != kDisableGlobalSequenceNumber);
+      SyncPoint::GetInstance()->SetCallBack(
+          "BlockIter::UpdateKey::value", [](void *arg) {
+            char *value = static_cast<char *>(arg);
+            // value can be delta-encoded with different lengths, so we
+            // corrupt the first byte here to be safe
+            ++value[0];
+          });
+
+      typedef std::unique_ptr<IndexBlockIter> IterPtr;
+      typedef void(IterAPI)(IterPtr & iter, std::string &);
+      std::string seek_key = first_keys[kNumRecords / 2];
+      auto test_seek = [&](IterAPI iter_api) {
+        std::unique_ptr<IndexBlockIter> biter = GenerateIndexBlockIter(
+            separators, block_handles, first_keys, kNumRecords, seqno);
+        ASSERT_OK(biter->status());
+        iter_api(biter, seek_key);
+        ASSERT_FALSE(biter->Valid());
+        ASSERT_TRUE(biter->status().IsCorruption());
+      };
+      test_seek([](IterPtr &iter, std::string &) { iter->SeekToFirst(); });
+      test_seek([](IterPtr &iter, std::string &) { iter->SeekToLast(); });
+      test_seek([](IterPtr &iter, std::string &k) { iter->Seek(k); });
+
+      typedef void (IndexBlockIter::*IterStepAPI)();
+      auto test_step = [&](IterStepAPI iter_api, std::string &k) {
+        std::unique_ptr<IndexBlockIter> biter = GenerateIndexBlockIter(
+            separators, block_handles, first_keys, kNumRecords, seqno);
+        SyncPoint::GetInstance()->DisableProcessing();
+        biter->Seek(k);
+        ASSERT_TRUE(biter->Valid());
+        ASSERT_OK(biter->status());
+        SyncPoint::GetInstance()->EnableProcessing();
+        std::invoke(iter_api, biter);
+        ASSERT_FALSE(biter->Valid());
+        ASSERT_TRUE(biter->status().IsCorruption());
+      };
+      if (kNumRecords > 1) {
+        test_step(&IndexBlockIter::Prev, seek_key);
+        test_step(&IndexBlockIter::Next, seek_key);
+      }
+    }
+  }
+}
+
+class MetaIndexBlockKVChecksumCorruptionTest
+    : public MetaIndexBlockKVChecksumTest {
+ public:
+  MetaIndexBlockKVChecksumCorruptionTest() = default;
+
+  std::unique_ptr<MetaBlockIter> GenerateMetaIndexBlockIter(
+      std::vector<std::string> &keys, std::vector<std::string> &values,
+      int num_record) {
+    SyncPoint::GetInstance()->DisableProcessing();
+    block_ = GenerateMetaIndexBlock(keys, values, num_record);
+    std::unique_ptr<MetaBlockIter> biter{
+        block_->NewMetaIterator(true /* block_contents_pinned */)};
+    SyncPoint::GetInstance()->EnableProcessing();
+    return biter;
+  }
+
+ protected:
+  std::unique_ptr<Block_kMetaIndex> block_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+    P, MetaIndexBlockKVChecksumCorruptionTest,
+    ::testing::Values(4, 8) /* block_protection_bytes_per_key */,
+    [](const testing::TestParamInfo<uint8_t> &args) {
+      std::ostringstream oss;
+      oss << "ProtBytes" << std::to_string(args.param);
+      return oss.str();
+    });
+
+TEST_P(MetaIndexBlockKVChecksumCorruptionTest, CorruptEntry) {
+  Options options = Options();
+  std::vector<int> num_restart_intervals = {1, 3};
+  for (const auto num_restart_interval : num_restart_intervals) {
+    const int kNumRecords =
+        num_restart_interval * static_cast<int>(GetRestartInterval());
+    std::vector<std::string> keys;
+    std::vector<std::string> values;
+    GenerateRandomKVs(&keys, &values, 0, kNumRecords + 1, 1 /* step */,
+                      24 /* padding_size */);
+    SyncPoint::GetInstance()->SetCallBack(
+        "BlockIter::UpdateKey::value", [](void *arg) {
+          char *value = static_cast<char *>(arg);
+          // values generated by GenerateRandomKVs are of length 100
+          ++value[10];
+        });
+
+    typedef std::unique_ptr<MetaBlockIter> IterPtr;
+    typedef void(IterAPI)(IterPtr & iter, std::string &);
+    typedef void (MetaBlockIter::*IterStepAPI)();
+    std::string seek_key = keys[kNumRecords / 2];
+    auto test_seek = [&](IterAPI iter_api) {
+      IterPtr biter = GenerateMetaIndexBlockIter(keys, values, kNumRecords);
+      ASSERT_OK(biter->status());
+      iter_api(biter, seek_key);
+      ASSERT_FALSE(biter->Valid());
+      ASSERT_TRUE(biter->status().IsCorruption());
+    };
+
+    test_seek([](IterPtr &iter, std::string &) { iter->SeekToFirst(); });
+    test_seek([](IterPtr &iter, std::string &) { iter->SeekToLast(); });
+    test_seek([](IterPtr &iter, std::string &k) { iter->Seek(k); });
+    test_seek([](IterPtr &iter, std::string &k) { iter->SeekForPrev(k); });
+
+    auto test_step = [&](IterStepAPI iter_api, const std::string &k) {
+      IterPtr biter = GenerateMetaIndexBlockIter(keys, values, kNumRecords);
+      SyncPoint::GetInstance()->DisableProcessing();
+      biter->Seek(k);
+      ASSERT_TRUE(biter->Valid());
+      ASSERT_OK(biter->status());
+      SyncPoint::GetInstance()->EnableProcessing();
+      std::invoke(iter_api, biter);
+      ASSERT_FALSE(biter->Valid());
+      ASSERT_TRUE(biter->status().IsCorruption());
+    };
+
+    if (kNumRecords > 1) {
+      test_step(&MetaBlockIter::Prev, seek_key);
+      test_step(&MetaBlockIter::Next, seek_key);
+    }
+  }
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char **argv) {
diff --git a/table/block_based/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc
index cd2e30833..2841b271d 100644
--- a/table/block_based/data_block_hash_index_test.cc
+++ b/table/block_based/data_block_hash_index_test.cc
@@ -581,8 +581,9 @@ void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2,
   const bool kImmortal = true;
   ASSERT_OK(ioptions.table_factory->NewTableReader(
       TableReaderOptions(ioptions, moptions.prefix_extractor, soptions,
-                         internal_comparator, !kSkipFilters, !kImmortal,
-                         level_),
+                         internal_comparator,
+                         0 /* block_protection_bytes_per_key */, !kSkipFilters,
+                         !kImmortal, level_),
       std::move(file_reader), sink->contents().size(), &table_reader));
   // Search using Get()
   ReadOptions ro;
diff --git a/table/block_fetcher_test.cc b/table/block_fetcher_test.cc
index 6d983f9b7..9696a509d 100644
--- a/table/block_fetcher_test.cc
+++ b/table/block_fetcher_test.cc
@@ -266,9 +266,9 @@ class BlockFetcherTest : public testing::Test {
     const auto* table_options =
         table_factory_.GetOptions<BlockBasedTableOptions>();
     ASSERT_NE(table_options, nullptr);
-    ASSERT_OK(BlockBasedTable::Open(ro, ioptions, EnvOptions(), *table_options,
-                                    comparator, std::move(file), file_size,
-                                    &table_reader));
+    ASSERT_OK(BlockBasedTable::Open(
+        ro, ioptions, EnvOptions(), *table_options, comparator,
+        std::move(file), file_size, 0 /* block_protection_bytes_per_key */,
+        &table_reader));
 
     table->reset(reinterpret_cast<BlockBasedTable*>(table_reader.release()));
   }
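
The remaining hunks plumb the option through the table layer: `TableReaderOptions` gains `block_protection_bytes_per_key` as a required positional argument immediately after the internal comparator (it must precede the defaulted parameters, which is why every existing call site is touched in this patch). Updated call sites follow this shape, shown here as a fragment rather than a complete program:

```
// Call-site shape after this change; pass 0 to keep protection disabled,
// or the column family's configured value to enable it.
TableReaderOptions t_opt(ioptions, moptions.prefix_extractor, env_options,
                         internal_comparator,
                         moptions.block_protection_bytes_per_key);
```
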
diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc
index fa3e5b47d..f6d6e195d 100644
--- a/table/sst_file_dumper.cc
+++ b/table/sst_file_dumper.cc
@@ -165,10 +165,10 @@ Status SstFileDumper::NewTableReader(
     const ImmutableOptions& /*ioptions*/, const EnvOptions& /*soptions*/,
     const InternalKeyComparator& /*internal_comparator*/, uint64_t file_size,
     std::unique_ptr<TableReader>* /*table_reader*/) {
-  auto t_opt =
-      TableReaderOptions(ioptions_, moptions_.prefix_extractor, soptions_,
-                         internal_comparator_, false /* skip_filters */,
-                         false /* imortal */, true /* force_direct_prefetch */);
+  auto t_opt = TableReaderOptions(
+      ioptions_, moptions_.prefix_extractor, soptions_, internal_comparator_,
+      0 /* block_protection_bytes_per_key */, false /* skip_filters */,
+      false /* immortal */, true /* force_direct_prefetch */);
   // Allow open file with global sequence number for backward compatibility.
   t_opt.largest_seqno = kMaxSequenceNumber;
 
diff --git a/table/sst_file_reader.cc b/table/sst_file_reader.cc
index c95c91743..533b7cd6a 100644
--- a/table/sst_file_reader.cc
+++ b/table/sst_file_reader.cc
@@ -56,7 +56,8 @@ Status SstFileReader::Open(const std::string& file_path) {
   }
   if (s.ok()) {
     TableReaderOptions t_opt(r->ioptions, r->moptions.prefix_extractor,
-                             r->soptions, r->ioptions.internal_comparator);
+                             r->soptions, r->ioptions.internal_comparator,
+                             r->moptions.block_protection_bytes_per_key);
     // Allow open file with global sequence number for backward compatibility.
     t_opt.largest_seqno = kMaxSequenceNumber;
     s = r->options.table_factory->NewTableReader(t_opt, std::move(file_reader),
diff --git a/table/table_builder.h b/table/table_builder.h
index 1790f33b1..e1bb4b557 100644
--- a/table/table_builder.h
+++ b/table/table_builder.h
@@ -37,9 +37,9 @@ struct TableReaderOptions {
       const std::shared_ptr<const SliceTransform>& _prefix_extractor,
       const EnvOptions& _env_options,
       const InternalKeyComparator& _internal_comparator,
-      bool _skip_filters = false, bool _immortal = false,
-      bool _force_direct_prefetch = false, int _level = -1,
-      BlockCacheTracer* const _block_cache_tracer = nullptr,
+      uint8_t _block_protection_bytes_per_key, bool _skip_filters = false,
+      bool _immortal = false, bool _force_direct_prefetch = false,
+      int _level = -1, BlockCacheTracer* const _block_cache_tracer = nullptr,
       size_t _max_file_size_for_l0_meta_pin = 0,
       const std::string& _cur_db_session_id = "", uint64_t _cur_file_num = 0,
       UniqueId64x2 _unique_id = {}, SequenceNumber _largest_seqno = 0)
@@ -56,7 +56,8 @@ struct TableReaderOptions {
         max_file_size_for_l0_meta_pin(_max_file_size_for_l0_meta_pin),
         cur_db_session_id(_cur_db_session_id),
         cur_file_num(_cur_file_num),
-        unique_id(_unique_id) {}
+        unique_id(_unique_id),
+        block_protection_bytes_per_key(_block_protection_bytes_per_key) {}
 
   const ImmutableOptions& ioptions;
   const std::shared_ptr<const SliceTransform>& prefix_extractor;
@@ -86,6 +87,8 @@ struct TableReaderOptions {
 
   // Known unique_id or {}, kNullUniqueId64x2 means unknown
   UniqueId64x2 unique_id;
+
+  uint8_t block_protection_bytes_per_key;
 };
 
 struct TableBuilderOptions {
diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc
index 09146f0ef..60c84d7bf 100644
--- a/table/table_reader_bench.cc
+++ b/table/table_reader_bench.cc
@@ -144,7 +144,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
         new RandomAccessFileReader(std::move(raf), file_name));
     s = opts.table_factory->NewTableReader(
         TableReaderOptions(ioptions, moptions.prefix_extractor, env_options,
-                           ikc),
+                           ikc, 0 /* block_protection_bytes_per_key */),
         std::move(file_reader), file_size, &table_reader);
     if (!s.ok()) {
       fprintf(stderr, "Open Table Error: %s\n", s.ToString().c_str());
diff --git a/table/table_test.cc b/table/table_test.cc
index de5f6432b..17b8af9f1 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -444,7 +444,9 @@ class TableConstructor : public Constructor {
     file_reader_.reset(new RandomAccessFileReader(std::move(source), "test"));
     return ioptions.table_factory->NewTableReader(
         TableReaderOptions(ioptions, moptions.prefix_extractor, soptions,
-                           *last_internal_comparator_, /*skip_filters*/ false,
+                           *last_internal_comparator_,
+                           0 /* block_protection_bytes_per_key */,
+                           /*skip_filters*/ false,
                            /*immortal*/ false, false, level_,
                            &block_cache_tracer_, moptions.write_buffer_size, "",
                            file_num_, kNullUniqueId64x2, largest_seqno_),
@@ -4795,7 +4797,7 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) {
 
   options.table_factory->NewTableReader(
       TableReaderOptions(ioptions, moptions.prefix_extractor, EnvOptions(),
-                         ikc),
+                         ikc, 0 /* block_protection_bytes_per_key */),
       std::move(file_reader), ss_rw.contents().size(), &table_reader);
 
   return table_reader->NewIterator(
@@ -4964,7 +4966,8 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) {
 
   ASSERT_OK(ioptions.table_factory->NewTableReader(
       TableReaderOptions(ioptions2, moptions2.prefix_extractor, EnvOptions(),
-                         GetPlainInternalComparator(options2.comparator)),
+                         GetPlainInternalComparator(options2.comparator),
+                         0 /* block_protection_bytes_per_key */),
       std::move(file_reader), sink->contents().size(), &table_reader));
 
   ReadOptions read_options;
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 759f634b2..ea40f5fa0 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1725,6 +1725,10 @@ DEFINE_uint32(
     "This options determines the size of such checksums. "
     "Supported values: 0, 1, 2, 4, 8.");
 
+DEFINE_uint32(block_protection_bytes_per_key, 0,
+              "Enable block per key-value checksum protection. "
+              "Supported values: 0, 1, 2, 4, 8.");
+
 DEFINE_bool(build_info, false,
             "Print the build info via GetRocksBuildInfoAsString");
 
@@ -4565,6 +4569,8 @@ class Benchmark {
     }
     options.memtable_protection_bytes_per_key =
         FLAGS_memtable_protection_bytes_per_key;
+    options.block_protection_bytes_per_key =
+        FLAGS_block_protection_bytes_per_key;
   }
 
   void InitializeOptionsGeneral(Options* opts) {
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index ac478e7ad..b57905ed4 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -37,6 +37,7 @@ default_params = {
     "backup_one_in": 100000,
     "batch_protection_bytes_per_key": lambda: random.choice([0, 8]),
     "memtable_protection_bytes_per_key": lambda: random.choice([0, 1, 2, 4, 8]),
+    "block_protection_bytes_per_key": lambda: random.choice([0, 1, 2, 4, 8]),
     "block_size": 16384,
    "bloom_bits": lambda: random.choice(
        [random.randint(0, 19), random.lognormvariate(2.3, 1.3)]