Block per key-value checksum (#11287)

Summary:
Add option `block_protection_bytes_per_key` and an implementation for block per key-value checksum (see the usage sketch after the list below). The main changes are
1. checksum construction and verification in block.cc/h
2. pass the option `block_protection_bytes_per_key` around (mainly for methods defined in table_cache.h)
3. unit tests/crash test updates
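
A minimal usage sketch (mine, not test code from this PR) that sets the new option through the standard `Options` API, which picks it up via `AdvancedColumnFamilyOptions`:

```
#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Enable per key-value protection for in-memory blocks with 1 checksum
  // byte per key (supported values: 0, 1, 2, 4, 8; 0 disables it).
  options.block_protection_bytes_per_key = 1;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/db_example", &db);
  assert(s.ok());
  delete db;
  return 0;
}
```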

Tests:
* Added unit tests
* Crash test: `python3 tools/db_crashtest.py blackbox --simple --block_protection_bytes_per_key=1 --write_buffer_size=1048576`

Follow up (maybe as a separate PR): make sure the corruption statuses returned from BlockIters are correctly handled.

Performance:
Turning on block per key-value protection has a non-trivial negative impact on read performance and costs additional memory.
For memory, each block stores an additional 24 bytes of checksum-related state besides the checksums themselves, so a block with, say, 100 entries at 1 byte per key adds roughly 124 bytes. For CPU, I set up a DB of size ~1.2GB with 5M keys (32-byte keys and 200-byte values) that compacts to ~5 SST files (target file size 256 MB) in L6 without compression. I tested readrandom performance with various block cache sizes (to mimic various cache hit rates):

```
SETUP
make OPTIMIZE_LEVEL="-O3" USE_LTO=1 DEBUG_LEVEL=0 -j32 db_bench
./db_bench -benchmarks=fillseq,compact0,waitforcompaction,compact,waitforcompaction -write_buffer_size=33554432 -level_compaction_dynamic_level_bytes=true -max_background_jobs=8 -target_file_size_base=268435456 --num=5000000 --key_size=32 --value_size=200 --compression_type=none

BENCHMARK
./db_bench --use_existing_db -benchmarks=readtocache,readrandom[-X10] --num=5000000 --key_size=32 --disable_auto_compactions --reads=1000000 --block_protection_bytes_per_key=[0|1] --cache_size=$CACHESIZE

The readrandom ops/sec looks like the following:
Block cache size:              2GB        1.2GB * 0.9    1.2GB * 0.8    1.2GB * 0.5    8MB
Main                           240805     223604         198176         161653         139040
PR prot_bytes=0                238691     226693         200127         161082         141153
PR prot_bytes=1                214983     193199         178532         137013         108211
prot_bytes=1 vs prot_bytes=0   -10%       -15%           -10.8%         -15%           -23%
```

The benchmark has a lot of variance, but the regression was roughly 10% to 23% in this benchmark, depending on the cache hit rate.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11287

Reviewed By: ajkr

Differential Revision: D43970708

Pulled By: cbi42

fbshipit-source-id: ef98d898b71779846fa74212b9ec9e08b7183940
Authored by Changyu Bi, committed by Facebook GitHub Bot
parent 40d69b59ad
commit 62fc15f009
49 files changed (change counts in parentheses):
1. HISTORY.md (1)
2. db/builder.cc (3)
3. db/column_family.cc (6)
4. db/compaction/compaction_job.cc (8)
5. db/compaction/compaction_job_test.cc (3)
6. db/convenience.cc (4)
7. db/external_sst_file_ingestion_job.cc (1)
8. db/forward_iterator.cc (21)
9. db/import_column_family_job.cc (1)
10. db/kv_checksum.h (98)
11. db/memtable.cc (60)
12. db/memtable.h (2)
13. db/repair.cc (12)
14. db/table_cache.cc (89)
15. db/table_cache.h (19)
16. db/table_cache_sync_and_async.h (3)
17. db/version_builder.cc (17)
18. db/version_builder.h (3)
19. db/version_edit_handler.cc (14)
20. db/version_set.cc (53)
21. db/version_set_sync_and_async.h (1)
22. db_stress_tool/db_stress_common.h (1)
23. db_stress_tool/db_stress_gflags.cc (5)
24. db_stress_tool/db_stress_test_base.cc (1)
25. fuzz/sst_file_writer_fuzzer.cc (3)
26. include/rocksdb/advanced_options.h (14)
27. options/cf_options.cc (4)
28. options/cf_options.h (3)
29. options/options_helper.cc (2)
30. options/options_settable_test.cc (3)
31. table/block_based/block.cc (222)
32. table/block_based/block.h (221)
33. table/block_based/block_based_table_builder.cc (7)
34. table/block_based/block_based_table_factory.cc (3)
35. table/block_based/block_based_table_reader.cc (13)
36. table/block_based/block_based_table_reader.h (1)
37. table/block_based/block_based_table_reader_test.cc (5)
38. table/block_based/block_cache.cc (10)
39. table/block_based/block_cache.h (16)
40. table/block_based/block_test.cc (920)
41. table/block_based/data_block_hash_index_test.cc (5)
42. table/block_fetcher_test.cc (6)
43. table/sst_file_dumper.cc (8)
44. table/sst_file_reader.cc (3)
45. table/table_builder.h (11)
46. table/table_reader_bench.cc (2)
47. table/table_test.cc (9)
48. tools/db_bench_tool.cc (6)
49. tools/db_crashtest.py (1)

@ -1,6 +1,7 @@
# Rocksdb Change Log
## Unreleased
### New Features
* Introduced a new option `block_protection_bytes_per_key`, which can be used to enable per key-value integrity protection for in-memory blocks in block cache (#11287).
## 8.2.0 (04/24/2023)
### Public API Changes

@ -380,7 +380,8 @@ Status BuildTable(
MaxFileSizeForL0MetaPin(mutable_cf_options),
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key*/ nullptr,
/*allow_unprepared_value*/ false));
/*allow_unprepared_value*/ false,
mutable_cf_options.block_protection_bytes_per_key));
s = it->status();
if (s.ok() && paranoid_file_checks) {
OutputValidator file_validator(tboptions.internal_comparator,

@ -1428,6 +1428,12 @@ Status ColumnFamilyData::ValidateOptions(
"Memtable per key-value checksum protection only supports 0, 1, 2, 4 "
"or 8 bytes per key.");
}
if (std::find(supported.begin(), supported.end(),
cf_options.block_protection_bytes_per_key) == supported.end()) {
return Status::NotSupported(
"Block per key-value checksum protection only supports 0, 1, 2, 4 "
"or 8 bytes per key.");
}
return s;
}

@ -504,7 +504,9 @@ void CompactionJob::GenSubcompactionBoundaries() {
FileMetaData* f = flevel->files[i].file_metadata;
std::vector<TableReader::Anchor> my_anchors;
Status s = cfd->table_cache()->ApproximateKeyAnchors(
read_options, icomp, *f, my_anchors);
read_options, icomp, *f,
c->mutable_cf_options()->block_protection_bytes_per_key,
my_anchors);
if (!s.ok() || my_anchors.empty()) {
my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize());
}
@ -735,7 +737,9 @@ Status CompactionJob::Run() {
*compact_->compaction->mutable_cf_options()),
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key=*/nullptr,
/*allow_unprepared_value=*/false);
/*allow_unprepared_value=*/false,
compact_->compaction->mutable_cf_options()
->block_protection_bytes_per_key);
auto s = iter->status();
if (s.ok() && paranoid_file_checks_) {

@ -454,7 +454,8 @@ class CompactionJobTestBase : public testing::Test {
Status s = cf_options_.table_factory->NewTableReader(
read_opts,
TableReaderOptions(*cfd->ioptions(), nullptr, FileOptions(),
cfd_->internal_comparator()),
cfd_->internal_comparator(),
0 /* block_protection_bytes_per_key */),
std::move(freader), file_size, &table_reader, false);
ASSERT_OK(s);
assert(table_reader);

@ -64,8 +64,8 @@ Status VerifySstFileChecksum(const Options& options,
const bool kImmortal = true;
auto reader_options = TableReaderOptions(
ioptions, options.prefix_extractor, env_options, internal_comparator,
false /* skip_filters */, !kImmortal, false /* force_direct_prefetch */,
-1 /* level */);
options.block_protection_bytes_per_key, false /* skip_filters */,
!kImmortal, false /* force_direct_prefetch */, -1 /* level */);
reader_options.largest_seqno = largest_seqno;
s = ioptions.table_factory->NewTableReader(
reader_options, std::move(file_reader), file_size, &table_reader,

@ -678,6 +678,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
TableReaderOptions(
*cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
env_options_, cfd_->internal_comparator(),
sv->mutable_cf_options.block_protection_bytes_per_key,
/*skip_filters*/ false, /*immortal*/ false,
/*force_direct_prefetch*/ false, /*level*/ -1,
/*block_cache_tracer*/ nullptr,

@ -36,7 +36,7 @@ class ForwardLevelIterator : public InternalIterator {
const ColumnFamilyData* const cfd, const ReadOptions& read_options,
const std::vector<FileMetaData*>& files,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
bool allow_unprepared_value)
bool allow_unprepared_value, uint8_t block_protection_bytes_per_key)
: cfd_(cfd),
read_options_(read_options),
files_(files),
@ -45,7 +45,8 @@ class ForwardLevelIterator : public InternalIterator {
file_iter_(nullptr),
pinned_iters_mgr_(nullptr),
prefix_extractor_(prefix_extractor),
allow_unprepared_value_(allow_unprepared_value) {
allow_unprepared_value_(allow_unprepared_value),
block_protection_bytes_per_key_(block_protection_bytes_per_key) {
status_.PermitUncheckedError(); // Allow uninitialized status through
}
@ -87,7 +88,8 @@ class ForwardLevelIterator : public InternalIterator {
/*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1,
/*max_file_size_for_l0_meta_pin=*/0,
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key=*/nullptr, allow_unprepared_value_);
/*largest_compaction_key=*/nullptr, allow_unprepared_value_,
block_protection_bytes_per_key_);
file_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
valid_ = false;
if (!range_del_agg.IsEmpty()) {
@ -211,6 +213,7 @@ class ForwardLevelIterator : public InternalIterator {
// Kept alive by ForwardIterator::sv_->mutable_cf_options
const std::shared_ptr<const SliceTransform>& prefix_extractor_;
const bool allow_unprepared_value_;
const uint8_t block_protection_bytes_per_key_;
};
ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options,
@ -738,7 +741,8 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) {
/*skip_filters=*/false, /*level=*/-1,
MaxFileSizeForL0MetaPin(sv_->mutable_cf_options),
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key=*/nullptr, allow_unprepared_value_));
/*largest_compaction_key=*/nullptr, allow_unprepared_value_,
sv_->mutable_cf_options.block_protection_bytes_per_key));
}
BuildLevelIterators(vstorage, sv_);
current_ = nullptr;
@ -819,7 +823,8 @@ void ForwardIterator::RenewIterators() {
/*skip_filters=*/false, /*level=*/-1,
MaxFileSizeForL0MetaPin(svnew->mutable_cf_options),
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key=*/nullptr, allow_unprepared_value_));
/*largest_compaction_key=*/nullptr, allow_unprepared_value_,
svnew->mutable_cf_options.block_protection_bytes_per_key));
}
for (auto* f : l0_iters_) {
@ -863,7 +868,8 @@ void ForwardIterator::BuildLevelIterators(const VersionStorageInfo* vstorage,
} else {
level_iters_.push_back(new ForwardLevelIterator(
cfd_, read_options_, level_files,
sv->mutable_cf_options.prefix_extractor, allow_unprepared_value_));
sv->mutable_cf_options.prefix_extractor, allow_unprepared_value_,
sv->mutable_cf_options.block_protection_bytes_per_key));
}
}
}
@ -885,7 +891,8 @@ void ForwardIterator::ResetIncompleteIterators() {
/*skip_filters=*/false, /*level=*/-1,
MaxFileSizeForL0MetaPin(sv_->mutable_cf_options),
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key=*/nullptr, allow_unprepared_value_);
/*largest_compaction_key=*/nullptr, allow_unprepared_value_,
sv_->mutable_cf_options.block_protection_bytes_per_key);
l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_);
}

@ -250,6 +250,7 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo(
TableReaderOptions(
*cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
env_options_, cfd_->internal_comparator(),
sv->mutable_cf_options.block_protection_bytes_per_key,
/*skip_filters*/ false, /*immortal*/ false,
/*force_direct_prefetch*/ false, /*level*/ -1,
/*block_cache_tracer*/ nullptr,

@ -46,6 +46,8 @@ template <typename T>
class ProtectionInfoKVOC;
template <typename T>
class ProtectionInfoKVOS;
template <typename T>
class ProtectionInfoKV;
// Aliases for 64-bit protection infos.
using ProtectionInfo64 = ProtectionInfo<uint64_t>;
@ -64,13 +66,13 @@ class ProtectionInfo {
ProtectionInfoKVO<T> ProtectKVO(const SliceParts& key,
const SliceParts& value,
ValueType op_type) const;
T GetVal() const { return val_; }
ProtectionInfoKV<T> ProtectKV(const Slice& key, const Slice& value) const;
private:
friend class ProtectionInfoKVO<T>;
friend class ProtectionInfoKVOS<T>;
friend class ProtectionInfoKVOC<T>;
friend class ProtectionInfoKV<T>;
// Each field is hashed with an independent value so we can catch fields being
// swapped. Per the `NPHash64()` docs, using consecutive seeds is a pitfall,
@ -89,8 +91,47 @@ class ProtectionInfo {
static_assert(sizeof(ProtectionInfo<T>) == sizeof(T), "");
}
T GetVal() const { return val_; }
void SetVal(T val) { val_ = val; }
void Encode(uint8_t len, char* dst) const {
assert(sizeof(val_) >= len);
switch (len) {
case 1:
dst[0] = static_cast<uint8_t>(val_);
break;
case 2:
EncodeFixed16(dst, static_cast<uint16_t>(val_));
break;
case 4:
EncodeFixed32(dst, static_cast<uint32_t>(val_));
break;
case 8:
EncodeFixed64(dst, static_cast<uint64_t>(val_));
break;
default:
assert(false);
}
}
bool Verify(uint8_t len, const char* checksum_ptr) const {
assert(sizeof(val_) >= len);
switch (len) {
case 1:
return static_cast<uint8_t>(checksum_ptr[0]) ==
static_cast<uint8_t>(val_);
case 2:
return DecodeFixed16(checksum_ptr) == static_cast<uint16_t>(val_);
case 4:
return DecodeFixed32(checksum_ptr) == static_cast<uint32_t>(val_);
case 8:
return DecodeFixed64(checksum_ptr) == static_cast<uint64_t>(val_);
default:
assert(false);
return false;
}
}
T val_ = 0;
};
@ -113,7 +154,14 @@ class ProtectionInfoKVO {
void UpdateV(const SliceParts& old_value, const SliceParts& new_value);
void UpdateO(ValueType old_op_type, ValueType new_op_type);
T GetVal() const { return info_.GetVal(); }
// Encode this protection info into `len` bytes and store them in `dst`.
void Encode(uint8_t len, char* dst) const { info_.Encode(len, dst); }
// Verify this protection info against the protection info encoded by Encode()
// at the first `len` bytes of `checksum_ptr`.
// Returns true iff the verification is successful.
bool Verify(uint8_t len, const char* checksum_ptr) const {
return info_.Verify(len, checksum_ptr);
}
private:
friend class ProtectionInfo<T>;
@ -124,6 +172,7 @@ class ProtectionInfoKVO {
static_assert(sizeof(ProtectionInfoKVO<T>) == sizeof(T), "");
}
T GetVal() const { return info_.GetVal(); }
void SetVal(T val) { info_.SetVal(val); }
ProtectionInfo<T> info_;
@ -154,7 +203,10 @@ class ProtectionInfoKVOC {
void UpdateC(ColumnFamilyId old_column_family_id,
ColumnFamilyId new_column_family_id);
T GetVal() const { return kvo_.GetVal(); }
void Encode(uint8_t len, char* dst) const { kvo_.Encode(len, dst); }
bool Verify(uint8_t len, const char* checksum_ptr) const {
return kvo_.Verify(len, checksum_ptr);
}
private:
friend class ProtectionInfoKVO<T>;
@ -163,6 +215,7 @@ class ProtectionInfoKVOC {
static_assert(sizeof(ProtectionInfoKVOC<T>) == sizeof(T), "");
}
T GetVal() const { return kvo_.GetVal(); }
void SetVal(T val) { kvo_.SetVal(val); }
ProtectionInfoKVO<T> kvo_;
@ -193,7 +246,10 @@ class ProtectionInfoKVOS {
void UpdateS(SequenceNumber old_sequence_number,
SequenceNumber new_sequence_number);
T GetVal() const { return kvo_.GetVal(); }
void Encode(uint8_t len, char* dst) const { kvo_.Encode(len, dst); }
bool Verify(uint8_t len, const char* checksum_ptr) const {
return kvo_.Verify(len, checksum_ptr);
}
private:
friend class ProtectionInfoKVO<T>;
@ -202,11 +258,32 @@ class ProtectionInfoKVOS {
static_assert(sizeof(ProtectionInfoKVOS<T>) == sizeof(T), "");
}
T GetVal() const { return kvo_.GetVal(); }
void SetVal(T val) { kvo_.SetVal(val); }
ProtectionInfoKVO<T> kvo_;
};
template <typename T>
class ProtectionInfoKV {
public:
ProtectionInfoKV() = default;
void Encode(uint8_t len, char* dst) const { info_.Encode(len, dst); }
bool Verify(uint8_t len, const char* checksum_ptr) const {
return info_.Verify(len, checksum_ptr);
}
private:
friend class ProtectionInfo<T>;
explicit ProtectionInfoKV(T val) : info_(val) {
static_assert(sizeof(ProtectionInfoKV<T>) == sizeof(T));
}
ProtectionInfo<T> info_;
};
template <typename T>
Status ProtectionInfo<T>::GetStatus() const {
if (val_ != 0) {
@ -244,6 +321,16 @@ ProtectionInfoKVO<T> ProtectionInfo<T>::ProtectKVO(const SliceParts& key,
return ProtectionInfoKVO<T>(val);
}
template <typename T>
ProtectionInfoKV<T> ProtectionInfo<T>::ProtectKV(const Slice& key,
const Slice& value) const {
T val = GetVal();
val = val ^ static_cast<T>(GetSliceNPHash64(key, ProtectionInfo<T>::kSeedK));
val =
val ^ static_cast<T>(GetSliceNPHash64(value, ProtectionInfo<T>::kSeedV));
return ProtectionInfoKV<T>(val);
}
template <typename T>
void ProtectionInfoKVO<T>::UpdateK(const Slice& old_key, const Slice& new_key) {
T val = GetVal();
@ -394,5 +481,4 @@ void ProtectionInfoKVOS<T>::UpdateS(SequenceNumber old_sequence_number,
sizeof(new_sequence_number), ProtectionInfo<T>::kSeedS));
SetVal(val);
}
} // namespace ROCKSDB_NAMESPACE
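
To make the intended flow concrete, here is a sketch (mine, not code from the patch) of how the `ProtectionInfoKV` API above supports block per key-value checksums: hash each entry once when the block is constructed, then recompute and compare on every read.

```
#include <cassert>

#include "db/kv_checksum.h"  // for ProtectionInfo64 / ProtectionInfoKV

namespace ROCKSDB_NAMESPACE {

// Sketch: construct and verify one per key-value block checksum. `len`
// stands for block_protection_bytes_per_key (1, 2, 4, or 8).
void CheckEntry(const Slice& key, const Slice& value, uint8_t len) {
  char checksum[8];  // large enough for the largest supported len
  // At block construction time: hash the entry and store `len` bytes.
  ProtectionInfo64().ProtectKV(key, value).Encode(len, checksum);
  // On each read: recompute from the entry as read and compare with the
  // stored bytes; a mismatch indicates in-memory corruption.
  bool ok = ProtectionInfo64().ProtectKV(key, value).Verify(len, checksum);
  assert(ok);
}

}  // namespace ROCKSDB_NAMESPACE
```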

@ -256,7 +256,7 @@ void MemTable::UpdateOldestKeyTime() {
}
Status MemTable::VerifyEntryChecksum(const char* entry,
size_t protection_bytes_per_key,
uint32_t protection_bytes_per_key,
bool allow_data_in_errors) {
if (protection_bytes_per_key == 0) {
return Status::OK();
@ -285,28 +285,11 @@ Status MemTable::VerifyEntryChecksum(const char* entry,
Slice value = Slice(value_ptr, value_length);
const char* checksum_ptr = value_ptr + value_length;
uint64_t expected = ProtectionInfo64()
.ProtectKVO(user_key, value, type)
.ProtectS(seq)
.GetVal();
bool match = true;
switch (protection_bytes_per_key) {
case 1:
match = static_cast<uint8_t>(checksum_ptr[0]) ==
static_cast<uint8_t>(expected);
break;
case 2:
match = DecodeFixed16(checksum_ptr) == static_cast<uint16_t>(expected);
break;
case 4:
match = DecodeFixed32(checksum_ptr) == static_cast<uint32_t>(expected);
break;
case 8:
match = DecodeFixed64(checksum_ptr) == expected;
break;
default:
assert(false);
}
bool match =
ProtectionInfo64()
.ProtectKVO(user_key, value, type)
.ProtectS(seq)
.Verify(static_cast<uint8_t>(protection_bytes_per_key), checksum_ptr);
if (!match) {
std::string msg(
"Corrupted memtable entry, per key-value checksum verification "
@ -526,7 +509,7 @@ class MemTableIterator : public InternalIterator {
bool valid_;
bool arena_mode_;
bool value_pinned_;
size_t protection_bytes_per_key_;
uint32_t protection_bytes_per_key_;
Status status_;
Logger* logger_;
@ -684,28 +667,15 @@ void MemTable::UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info,
return;
}
uint64_t checksum = 0;
if (kv_prot_info == nullptr) {
checksum =
ProtectionInfo64().ProtectKVO(key, value, type).ProtectS(s).GetVal();
ProtectionInfo64()
.ProtectKVO(key, value, type)
.ProtectS(s)
.Encode(static_cast<uint8_t>(moptions_.protection_bytes_per_key),
checksum_ptr);
} else {
checksum = kv_prot_info->GetVal();
}
switch (moptions_.protection_bytes_per_key) {
case 1:
checksum_ptr[0] = static_cast<uint8_t>(checksum);
break;
case 2:
EncodeFixed16(checksum_ptr, static_cast<uint16_t>(checksum));
break;
case 4:
EncodeFixed32(checksum_ptr, static_cast<uint32_t>(checksum));
break;
case 8:
EncodeFixed64(checksum_ptr, checksum);
break;
default:
assert(false);
kv_prot_info->Encode(
static_cast<uint8_t>(moptions_.protection_bytes_per_key), checksum_ptr);
}
}
@ -902,7 +872,7 @@ struct Saver {
ReadCallback* callback_;
bool* is_blob_index;
bool allow_data_in_errors;
size_t protection_bytes_per_key;
uint32_t protection_bytes_per_key;
bool CheckCallback(SequenceNumber _seq) {
if (callback_) {
return callback_->IsVisible(_seq);

@ -529,7 +529,7 @@ class MemTable {
// Returns Corruption status if verification fails.
static Status VerifyEntryChecksum(const char* entry,
size_t protection_bytes_per_key,
uint32_t protection_bytes_per_key,
bool allow_data_in_errors = false);
private:

@ -518,8 +518,9 @@ class Repairer {
if (status.ok()) {
// TODO: plumb Env::IOActivity
const ReadOptions read_options;
status = table_cache_->GetTableProperties(file_options_, read_options,
icmp_, t->meta, &props);
status = table_cache_->GetTableProperties(
file_options_, read_options, icmp_, t->meta, &props,
0 /* block_protection_bytes_per_key */);
}
if (status.ok()) {
auto s =
@ -577,7 +578,8 @@ class Repairer {
/*level=*/-1, /*max_file_size_for_l0_meta_pin=*/0,
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key=*/nullptr,
/*allow_unprepared_value=*/false);
/*allow_unprepared_value=*/false,
cfd->GetLatestMutableCFOptions()->block_protection_bytes_per_key);
ParsedInternalKey parsed;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
Slice key = iter->key();
@ -617,7 +619,9 @@ class Repairer {
ReadOptions ropts;
std::unique_ptr<FragmentedRangeTombstoneIterator> r_iter;
status = table_cache_->GetRangeTombstoneIterator(
ropts, cfd->internal_comparator(), t->meta, &r_iter);
ropts, cfd->internal_comparator(), t->meta,
cfd->GetLatestMutableCFOptions()->block_protection_bytes_per_key,
&r_iter);
if (r_iter) {
r_iter->SeekToFirst();

@ -91,7 +91,8 @@ Status TableCache::GetTableReader(
const ReadOptions& ro, const FileOptions& file_options,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, bool sequential_mode, bool record_read_stats,
HistogramImpl* file_read_hist, std::unique_ptr<TableReader>* table_reader,
uint8_t block_protection_bytes_per_key, HistogramImpl* file_read_hist,
std::unique_ptr<TableReader>* table_reader,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
bool skip_filters, int level, bool prefetch_index_and_filter_in_cache,
size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) {
@ -140,7 +141,8 @@ Status TableCache::GetTableReader(
s = ioptions_.table_factory->NewTableReader(
ro,
TableReaderOptions(ioptions_, prefix_extractor, file_options,
internal_comparator, skip_filters, immortal_tables_,
internal_comparator, block_protection_bytes_per_key,
skip_filters, immortal_tables_,
false /* force_direct_prefetch */, level,
block_cache_tracer_, max_file_size_for_l0_meta_pin,
db_session_id_, file_meta.fd.GetNumber(),
@ -156,6 +158,7 @@ Status TableCache::FindTable(
const ReadOptions& ro, const FileOptions& file_options,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, TypedHandle** handle,
uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
const bool no_io, bool record_read_stats, HistogramImpl* file_read_hist,
bool skip_filters, int level, bool prefetch_index_and_filter_in_cache,
@ -179,12 +182,12 @@ Status TableCache::FindTable(
}
std::unique_ptr<TableReader> table_reader;
Status s =
GetTableReader(ro, file_options, internal_comparator, file_meta,
false /* sequential mode */, record_read_stats,
file_read_hist, &table_reader, prefix_extractor,
skip_filters, level, prefetch_index_and_filter_in_cache,
max_file_size_for_l0_meta_pin, file_temperature);
Status s = GetTableReader(ro, file_options, internal_comparator, file_meta,
false /* sequential mode */, record_read_stats,
block_protection_bytes_per_key, file_read_hist,
&table_reader, prefix_extractor, skip_filters,
level, prefetch_index_and_filter_in_cache,
max_file_size_for_l0_meta_pin, file_temperature);
if (!s.ok()) {
assert(table_reader == nullptr);
RecordTick(ioptions_.stats, NO_FILE_ERRORS);
@ -212,6 +215,7 @@ InternalIterator* TableCache::NewIterator(
size_t max_file_size_for_l0_meta_pin,
const InternalKey* smallest_compaction_key,
const InternalKey* largest_compaction_key, bool allow_unprepared_value,
uint8_t block_protection_bytes_per_key,
TruncatedRangeDelIterator** range_del_iter) {
PERF_TIMER_GUARD(new_table_iterator_nanos);
@ -225,12 +229,13 @@ InternalIterator* TableCache::NewIterator(
auto& fd = file_meta.fd;
table_reader = fd.table_reader;
if (table_reader == nullptr) {
s = FindTable(
options, file_options, icomparator, file_meta, &handle,
prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */,
!for_compaction /* record_read_stats */, file_read_hist, skip_filters,
level, true /* prefetch_index_and_filter_in_cache */,
max_file_size_for_l0_meta_pin, file_meta.temperature);
s = FindTable(options, file_options, icomparator, file_meta, &handle,
block_protection_bytes_per_key, prefix_extractor,
options.read_tier == kBlockCacheTier /* no_io */,
!for_compaction /* record_read_stats */, file_read_hist,
skip_filters, level,
true /* prefetch_index_and_filter_in_cache */,
max_file_size_for_l0_meta_pin, file_meta.temperature);
if (s.ok()) {
table_reader = cache_.Value(handle);
}
@ -308,7 +313,7 @@ InternalIterator* TableCache::NewIterator(
Status TableCache::GetRangeTombstoneIterator(
const ReadOptions& options,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta,
const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key,
std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter) {
assert(out_iter);
const FileDescriptor& fd = file_meta.fd;
@ -317,7 +322,7 @@ Status TableCache::GetRangeTombstoneIterator(
TypedHandle* handle = nullptr;
if (t == nullptr) {
s = FindTable(options, file_options_, internal_comparator, file_meta,
&handle);
&handle, block_protection_bytes_per_key);
if (s.ok()) {
t = cache_.Value(handle);
}
@ -403,6 +408,7 @@ Status TableCache::Get(
const ReadOptions& options,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, const Slice& k, GetContext* get_context,
uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
HistogramImpl* file_read_hist, bool skip_filters, int level,
size_t max_file_size_for_l0_meta_pin) {
@ -430,7 +436,7 @@ Status TableCache::Get(
assert(s.ok());
if (t == nullptr) {
s = FindTable(options, file_options_, internal_comparator, file_meta,
&handle, prefix_extractor,
&handle, block_protection_bytes_per_key, prefix_extractor,
options.read_tier == kBlockCacheTier /* no_io */,
true /* record_read_stats */, file_read_hist, skip_filters,
level, true /* prefetch_index_and_filter_in_cache */,
@ -513,7 +519,8 @@ Status TableCache::MultiGetFilter(
const FileMetaData& file_meta,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
HistogramImpl* file_read_hist, int level,
MultiGetContext::Range* mget_range, TypedHandle** table_handle) {
MultiGetContext::Range* mget_range, TypedHandle** table_handle,
uint8_t block_protection_bytes_per_key) {
auto& fd = file_meta.fd;
IterKey row_cache_key;
std::string row_cache_entry_buffer;
@ -531,12 +538,13 @@ Status TableCache::MultiGetFilter(
MultiGetContext::Range tombstone_range(*mget_range, mget_range->begin(),
mget_range->end());
if (t == nullptr) {
s = FindTable(
options, file_options_, internal_comparator, file_meta, &handle,
prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */,
true /* record_read_stats */, file_read_hist, /*skip_filters=*/false,
level, true /* prefetch_index_and_filter_in_cache */,
/*max_file_size_for_l0_meta_pin=*/0, file_meta.temperature);
s = FindTable(options, file_options_, internal_comparator, file_meta,
&handle, block_protection_bytes_per_key, prefix_extractor,
options.read_tier == kBlockCacheTier /* no_io */,
true /* record_read_stats */, file_read_hist,
/*skip_filters=*/false, level,
true /* prefetch_index_and_filter_in_cache */,
/*max_file_size_for_l0_meta_pin=*/0, file_meta.temperature);
if (s.ok()) {
t = cache_.Value(handle);
}
@ -564,6 +572,7 @@ Status TableCache::GetTableProperties(
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta,
std::shared_ptr<const TableProperties>* properties,
uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor, bool no_io) {
auto table_reader = file_meta.fd.table_reader;
// table already been pre-loaded?
@ -575,7 +584,8 @@ Status TableCache::GetTableProperties(
TypedHandle* table_handle = nullptr;
Status s = FindTable(read_options, file_options, internal_comparator,
file_meta, &table_handle, prefix_extractor, no_io);
file_meta, &table_handle, block_protection_bytes_per_key,
prefix_extractor, no_io);
if (!s.ok()) {
return s;
}
@ -588,12 +598,14 @@ Status TableCache::GetTableProperties(
Status TableCache::ApproximateKeyAnchors(
const ReadOptions& ro, const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, std::vector<TableReader::Anchor>& anchors) {
const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key,
std::vector<TableReader::Anchor>& anchors) {
Status s;
TableReader* t = file_meta.fd.table_reader;
TypedHandle* handle = nullptr;
if (t == nullptr) {
s = FindTable(ro, file_options_, internal_comparator, file_meta, &handle);
s = FindTable(ro, file_options_, internal_comparator, file_meta, &handle,
block_protection_bytes_per_key);
if (s.ok()) {
t = cache_.Value(handle);
}
@ -610,7 +622,7 @@ Status TableCache::ApproximateKeyAnchors(
size_t TableCache::GetMemoryUsageByTableReader(
const FileOptions& file_options, const ReadOptions& read_options,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta,
const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor) {
auto table_reader = file_meta.fd.table_reader;
// table already been pre-loaded?
@ -620,7 +632,8 @@ size_t TableCache::GetMemoryUsageByTableReader(
TypedHandle* table_handle = nullptr;
Status s = FindTable(read_options, file_options, internal_comparator,
file_meta, &table_handle, prefix_extractor, true);
file_meta, &table_handle, block_protection_bytes_per_key,
prefix_extractor, true /* no_io */);
if (!s.ok()) {
return 0;
}
@ -639,16 +652,17 @@ uint64_t TableCache::ApproximateOffsetOf(
const ReadOptions& read_options, const Slice& key,
const FileMetaData& file_meta, TableReaderCaller caller,
const InternalKeyComparator& internal_comparator,
uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor) {
uint64_t result = 0;
TableReader* table_reader = file_meta.fd.table_reader;
TypedHandle* table_handle = nullptr;
if (table_reader == nullptr) {
const bool for_compaction = (caller == TableReaderCaller::kCompaction);
Status s =
FindTable(read_options, file_options_, internal_comparator, file_meta,
&table_handle, prefix_extractor, false /* no_io */,
!for_compaction /* record_read_stats */);
Status s = FindTable(
read_options, file_options_, internal_comparator, file_meta,
&table_handle, block_protection_bytes_per_key, prefix_extractor,
false /* no_io */, !for_compaction /* record_read_stats */);
if (s.ok()) {
table_reader = cache_.Value(table_handle);
}
@ -668,16 +682,17 @@ uint64_t TableCache::ApproximateSize(
const ReadOptions& read_options, const Slice& start, const Slice& end,
const FileMetaData& file_meta, TableReaderCaller caller,
const InternalKeyComparator& internal_comparator,
uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor) {
uint64_t result = 0;
TableReader* table_reader = file_meta.fd.table_reader;
TypedHandle* table_handle = nullptr;
if (table_reader == nullptr) {
const bool for_compaction = (caller == TableReaderCaller::kCompaction);
Status s =
FindTable(read_options, file_options_, internal_comparator, file_meta,
&table_handle, prefix_extractor, false /* no_io */,
!for_compaction /* record_read_stats */);
Status s = FindTable(
read_options, file_options_, internal_comparator, file_meta,
&table_handle, block_protection_bytes_per_key, prefix_extractor,
false /* no_io */, !for_compaction /* record_read_stats */);
if (s.ok()) {
table_reader = cache_.Value(table_handle);
}

@ -96,6 +96,7 @@ class TableCache {
size_t max_file_size_for_l0_meta_pin,
const InternalKey* smallest_compaction_key,
const InternalKey* largest_compaction_key, bool allow_unprepared_value,
uint8_t protection_bytes_per_key,
TruncatedRangeDelIterator** range_del_iter = nullptr);
// If a seek to internal key "k" in specified file finds an entry,
@ -112,6 +113,7 @@ class TableCache {
const ReadOptions& options,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, const Slice& k, GetContext* get_context,
uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
int level = -1, size_t max_file_size_for_l0_meta_pin = 0);
@ -121,7 +123,7 @@ class TableCache {
Status GetRangeTombstoneIterator(
const ReadOptions& options,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta,
const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key,
std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter);
// Call table reader's MultiGetFilter to use the bloom filter to filter out
@ -135,7 +137,8 @@ class TableCache {
const FileMetaData& file_meta,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
HistogramImpl* file_read_hist, int level,
MultiGetContext::Range* mget_range, TypedHandle** table_handle);
MultiGetContext::Range* mget_range, TypedHandle** table_handle,
uint8_t block_protection_bytes_per_key);
// If a seek to internal key "k" in specified file finds an entry,
// call get_context->SaveValue() repeatedly until
@ -150,6 +153,7 @@ class TableCache {
Status, MultiGet, const ReadOptions& options,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, const MultiGetContext::Range* mget_range,
uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
bool skip_range_deletions = false, int level = -1,
@ -165,6 +169,7 @@ class TableCache {
const ReadOptions& ro, const FileOptions& toptions,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, TypedHandle**,
uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
const bool no_io = false, bool record_read_stats = true,
HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
@ -183,12 +188,14 @@ class TableCache {
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta,
std::shared_ptr<const TableProperties>* properties,
uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
bool no_io = false);
Status ApproximateKeyAnchors(const ReadOptions& ro,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta,
uint8_t block_protection_bytes_per_key,
std::vector<TableReader::Anchor>& anchors);
// Return total memory usage of the table reader of the file.
@ -196,7 +203,7 @@ class TableCache {
size_t GetMemoryUsageByTableReader(
const FileOptions& toptions, const ReadOptions& read_options,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta,
const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
// Returns approximated offset of a key in a file represented by fd.
@ -204,6 +211,7 @@ class TableCache {
const ReadOptions& read_options, const Slice& key,
const FileMetaData& file_meta, TableReaderCaller caller,
const InternalKeyComparator& internal_comparator,
uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
// Returns approximated data size between start and end keys in a file
@ -212,6 +220,7 @@ class TableCache {
const ReadOptions& read_options, const Slice& start, const Slice& end,
const FileMetaData& file_meta, TableReaderCaller caller,
const InternalKeyComparator& internal_comparator,
uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
CacheInterface& get_cache() { return cache_; }
@ -234,8 +243,8 @@ class TableCache {
const ReadOptions& ro, const FileOptions& file_options,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, bool sequential_mode,
bool record_read_stats, HistogramImpl* file_read_hist,
std::unique_ptr<TableReader>* table_reader,
bool record_read_stats, uint8_t block_protection_bytes_per_key,
HistogramImpl* file_read_hist, std::unique_ptr<TableReader>* table_reader,
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
bool skip_filters = false, int level = -1,
bool prefetch_index_and_filter_in_cache = true,

@ -17,6 +17,7 @@ namespace ROCKSDB_NAMESPACE {
DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet)
(const ReadOptions& options, const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, const MultiGetContext::Range* mget_range,
uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
HistogramImpl* file_read_hist, bool skip_filters, bool skip_range_deletions,
int level, TypedHandle* handle) {
@ -65,7 +66,7 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet)
if (t == nullptr) {
assert(handle == nullptr);
s = FindTable(options, file_options_, internal_comparator, file_meta,
&handle, prefix_extractor,
&handle, block_protection_bytes_per_key, prefix_extractor,
options.read_tier == kBlockCacheTier /* no_io */,
true /* record_read_stats */, file_read_hist, skip_filters,
level, true /* prefetch_index_and_filter_in_cache */,

@ -1257,7 +1257,8 @@ class VersionBuilder::Rep {
InternalStats* internal_stats, int max_threads,
bool prefetch_index_and_filter_in_cache, bool is_initial_load,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options) {
size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options,
uint8_t block_protection_bytes_per_key) {
assert(table_cache_ != nullptr);
size_t table_cache_capacity =
@ -1326,7 +1327,8 @@ class VersionBuilder::Rep {
statuses[file_idx] = table_cache_->FindTable(
read_options, file_options_,
*(base_vstorage_->InternalComparator()), *file_meta, &handle,
prefix_extractor, false /*no_io */, true /* record_read_stats */,
block_protection_bytes_per_key, prefix_extractor, false /*no_io */,
true /* record_read_stats */,
internal_stats->GetFileReadHist(level), false, level,
prefetch_index_and_filter_in_cache, max_file_size_for_l0_meta_pin,
file_meta->temperature);
@ -1384,11 +1386,12 @@ Status VersionBuilder::LoadTableHandlers(
InternalStats* internal_stats, int max_threads,
bool prefetch_index_and_filter_in_cache, bool is_initial_load,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options) {
return rep_->LoadTableHandlers(internal_stats, max_threads,
prefetch_index_and_filter_in_cache,
is_initial_load, prefix_extractor,
max_file_size_for_l0_meta_pin, read_options);
size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options,
uint8_t block_protection_bytes_per_key) {
return rep_->LoadTableHandlers(
internal_stats, max_threads, prefetch_index_and_filter_in_cache,
is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin,
read_options, block_protection_bytes_per_key);
}
uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const {

@ -48,7 +48,8 @@ class VersionBuilder {
InternalStats* internal_stats, int max_threads,
bool prefetch_index_and_filter_in_cache, bool is_initial_load,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options);
size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options,
uint8_t block_protection_bytes_per_key);
uint64_t GetMinOldestBlobFileNumber() const;
private:

@ -566,13 +566,13 @@ Status VersionEditHandler::LoadTables(ColumnFamilyData* cfd,
assert(builder_iter->second != nullptr);
VersionBuilder* builder = builder_iter->second->version_builder();
assert(builder);
const MutableCFOptions* moptions = cfd->GetLatestMutableCFOptions();
Status s = builder->LoadTableHandlers(
cfd->internal_stats(),
version_set_->db_options_->max_file_opening_threads,
prefetch_index_and_filter_in_cache, is_initial_load,
cfd->GetLatestMutableCFOptions()->prefix_extractor,
MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()),
read_options_);
moptions->prefix_extractor, MaxFileSizeForL0MetaPin(*moptions),
read_options_, moptions->block_protection_bytes_per_key);
if ((s.IsPathNotFound() || s.IsCorruption()) && no_error_if_files_missing_) {
s = Status::OK();
}
@ -812,16 +812,16 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion(
assert(builder);
}
const MutableCFOptions* cf_opts_ptr = cfd->GetLatestMutableCFOptions();
auto* version = new Version(cfd, version_set_, version_set_->file_options_,
*cfd->GetLatestMutableCFOptions(), io_tracer_,
*cf_opts_ptr, io_tracer_,
version_set_->current_version_number_++,
epoch_number_requirement_);
s = builder->LoadTableHandlers(
cfd->internal_stats(),
version_set_->db_options_->max_file_opening_threads, false, true,
cfd->GetLatestMutableCFOptions()->prefix_extractor,
MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()),
read_options_);
cf_opts_ptr->prefix_extractor, MaxFileSizeForL0MetaPin(*cf_opts_ptr),
read_options_, cf_opts_ptr->block_protection_bytes_per_key);
if (!s.ok()) {
delete version;
if (s.IsCorruption()) {

@ -941,7 +941,7 @@ class LevelIterator final : public InternalIterator {
const std::shared_ptr<const SliceTransform>& prefix_extractor,
bool should_sample, HistogramImpl* file_read_hist,
TableReaderCaller caller, bool skip_filters, int level,
RangeDelAggregator* range_del_agg,
uint8_t block_protection_bytes_per_key, RangeDelAggregator* range_del_agg,
const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries =
nullptr,
bool allow_unprepared_value = false,
@ -964,6 +964,7 @@ class LevelIterator final : public InternalIterator {
pinned_iters_mgr_(nullptr),
compaction_boundaries_(compaction_boundaries),
is_next_read_sequential_(false),
block_protection_bytes_per_key_(block_protection_bytes_per_key),
range_tombstone_iter_(nullptr),
to_return_sentinel_(false) {
// Empty level is not supported.
@ -1107,7 +1108,8 @@ class LevelIterator final : public InternalIterator {
nullptr /* don't need reference to table */, file_read_hist_, caller_,
/*arena=*/nullptr, skip_filters_, level_,
/*max_file_size_for_l0_meta_pin=*/0, smallest_compaction_key,
largest_compaction_key, allow_unprepared_value_, range_tombstone_iter_);
largest_compaction_key, allow_unprepared_value_,
block_protection_bytes_per_key_, range_tombstone_iter_);
}
// Check if current file being fully within iterate_lower_bound.
@ -1154,6 +1156,8 @@ class LevelIterator final : public InternalIterator {
bool is_next_read_sequential_;
uint8_t block_protection_bytes_per_key_;
// This is set when this level iterator is used under a merging iterator
// that processes range tombstones. range_tombstone_iter_ points to where the
// merging iterator stores the range tombstones iterator for this level. When
@ -1535,6 +1539,7 @@ Status Version::GetTableProperties(const ReadOptions& read_options,
auto ioptions = cfd_->ioptions();
Status s = table_cache->GetTableProperties(
file_options_, read_options, cfd_->internal_comparator(), *file_meta, tp,
mutable_cf_options_.block_protection_bytes_per_key,
mutable_cf_options_.prefix_extractor, true /* no io */);
if (s.ok()) {
return s;
@ -1621,6 +1626,7 @@ Status Version::TablesRangeTombstoneSummary(int max_entries_to_print,
Status s = table_cache->GetRangeTombstoneIterator(
read_options, cfd_->internal_comparator(), *file_meta,
cfd_->GetLatestMutableCFOptions()->block_protection_bytes_per_key,
&tombstone_iter);
if (!s.ok()) {
return s;
@ -1739,6 +1745,7 @@ size_t Version::GetMemoryUsageByTableReaders(const ReadOptions& read_options) {
total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader(
file_options_, read_options, cfd_->internal_comparator(),
*file_level.files[i].file_metadata,
mutable_cf_options_.block_protection_bytes_per_key,
mutable_cf_options_.prefix_extractor);
}
}
@ -1848,6 +1855,7 @@ InternalIterator* Version::TEST_GetLevelIterator(
mutable_cf_options_.prefix_extractor, should_sample_file_read(),
cfd_->internal_stats()->GetFileReadHist(level),
TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
mutable_cf_options_.block_protection_bytes_per_key,
nullptr /* range_del_agg */, nullptr /* compaction_boundaries */,
allow_unprepared_value, &tombstone_iter_ptr);
if (read_options.ignore_range_deletions) {
@ -1946,7 +1954,7 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
/*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key=*/nullptr, allow_unprepared_value,
&tombstone_iter);
mutable_cf_options_.block_protection_bytes_per_key, &tombstone_iter);
if (read_options.ignore_range_deletions) {
merge_iter_builder->AddIterator(table_iter);
} else {
@ -1975,8 +1983,10 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
mutable_cf_options_.prefix_extractor, should_sample_file_read(),
cfd_->internal_stats()->GetFileReadHist(level),
TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
/*range_del_agg=*/nullptr, /*compaction_boundaries=*/nullptr,
allow_unprepared_value, &tombstone_iter_ptr);
mutable_cf_options_.block_protection_bytes_per_key,
/*range_del_agg=*/nullptr,
/*compaction_boundaries=*/nullptr, allow_unprepared_value,
&tombstone_iter_ptr);
if (read_options.ignore_range_deletions) {
merge_iter_builder->AddIterator(level_iter);
} else {
@ -2019,7 +2029,8 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
/*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key=*/nullptr,
/*allow_unprepared_value=*/false));
/*allow_unprepared_value=*/false,
mutable_cf_options_.block_protection_bytes_per_key));
status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key,
iter.get(), overlap);
if (!status.ok() || *overlap) {
@ -2034,7 +2045,8 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
mutable_cf_options_.prefix_extractor, should_sample_file_read(),
cfd_->internal_stats()->GetFileReadHist(level),
TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
&range_del_agg));
mutable_cf_options_.block_protection_bytes_per_key, &range_del_agg,
nullptr, false));
status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key,
iter.get(), overlap);
}
@ -2333,7 +2345,8 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
StopWatchNano timer(clock_, timer_enabled /* auto_start */);
*status = table_cache_->Get(
read_options, *internal_comparator(), *f->file_metadata, ikey,
&get_context, mutable_cf_options_.prefix_extractor,
&get_context, mutable_cf_options_.block_protection_bytes_per_key,
mutable_cf_options_.prefix_extractor,
cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
fp.IsHitFileLastInLevel()),
@ -2578,7 +2591,8 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
read_options, *internal_comparator(), *f->file_metadata,
mutable_cf_options_.prefix_extractor,
cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
fp.GetHitFileLevel(), &file_range, &table_handle);
fp.GetHitFileLevel(), &file_range, &table_handle,
mutable_cf_options_.block_protection_bytes_per_key);
skip_range_deletions = true;
if (status.ok()) {
skip_filters = true;
@ -2768,7 +2782,8 @@ Status Version::ProcessBatch(
read_options, *internal_comparator(), *f->file_metadata,
mutable_cf_options_.prefix_extractor,
cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
fp.GetHitFileLevel(), &file_range, &table_handle);
fp.GetHitFileLevel(), &file_range, &table_handle,
mutable_cf_options_.block_protection_bytes_per_key);
if (status.ok()) {
skip_filters = true;
skip_range_deletions = true;
@ -5217,7 +5232,8 @@ Status VersionSet::ProcessManifestWrites(
true /* prefetch_index_and_filter_in_cache */,
false /* is_initial_load */,
mutable_cf_options_ptrs[i]->prefix_extractor,
MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i]), read_options);
MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i]), read_options,
mutable_cf_options_ptrs[i]->block_protection_bytes_per_key);
if (!s.ok()) {
if (db_options_->paranoid_checks) {
break;
@ -6553,10 +6569,11 @@ uint64_t VersionSet::ApproximateOffsetOf(const ReadOptions& read_options,
// "key" falls in the range for this table. Add the
// approximate offset of "key" within the table.
TableCache* table_cache = v->cfd_->table_cache();
const MutableCFOptions& cf_opts = v->GetMutableCFOptions();
if (table_cache != nullptr) {
result = table_cache->ApproximateOffsetOf(
read_options, key, *f.file_metadata, caller, icmp,
v->GetMutableCFOptions().prefix_extractor);
cf_opts.block_protection_bytes_per_key, cf_opts.prefix_extractor);
}
}
return result;
@ -6596,9 +6613,10 @@ uint64_t VersionSet::ApproximateSize(const ReadOptions& read_options,
if (table_cache == nullptr) {
return 0;
}
const MutableCFOptions& cf_opts = v->GetMutableCFOptions();
return table_cache->ApproximateSize(
read_options, start, end, *f.file_metadata, caller, icmp,
v->GetMutableCFOptions().prefix_extractor);
cf_opts.block_protection_bytes_per_key, cf_opts.prefix_extractor);
}
void VersionSet::RemoveLiveFiles(
@ -6757,6 +6775,7 @@ InternalIterator* VersionSet::MakeInputIterator(
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key=*/nullptr,
/*allow_unprepared_value=*/false,
c->mutable_cf_options()->block_protection_bytes_per_key,
/*range_del_iter=*/&range_tombstone_iter);
range_tombstones.emplace_back(range_tombstone_iter, nullptr);
}
@ -6770,8 +6789,9 @@ InternalIterator* VersionSet::MakeInputIterator(
/*should_sample=*/false,
/*no per level latency histogram=*/nullptr,
TableReaderCaller::kCompaction, /*skip_filters=*/false,
/*level=*/static_cast<int>(c->level(which)), range_del_agg,
c->boundaries(which), false, &tombstone_iter_ptr);
/*level=*/static_cast<int>(c->level(which)),
c->mutable_cf_options()->block_protection_bytes_per_key,
range_del_agg, c->boundaries(which), false, &tombstone_iter_ptr);
range_tombstones.emplace_back(nullptr, tombstone_iter_ptr);
}
}
@ -7008,7 +7028,8 @@ Status VersionSet::VerifyFileMetadata(const ReadOptions& read_options,
TableCache::TypedHandle* handle = nullptr;
FileMetaData meta_copy = meta;
status = table_cache->FindTable(
read_options, file_opts, *icmp, meta_copy, &handle, pe,
read_options, file_opts, *icmp, meta_copy, &handle,
cf_opts->block_protection_bytes_per_key, pe,
/*no_io=*/false, /*record_read_stats=*/true,
internal_stats->GetFileReadHist(level), false, level,
/*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin,

@ -25,6 +25,7 @@ DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST)
StopWatchNano timer(clock_, timer_enabled /* auto_start */);
s = CO_AWAIT(table_cache_->MultiGet)(
read_options, *internal_comparator(), *f->file_metadata, &file_range,
mutable_cf_options_.block_protection_bytes_per_key,
mutable_cf_options_.prefix_extractor,
cfd_->internal_stats()->GetFileReadHist(hit_file_level), skip_filters,
skip_range_deletions, hit_file_level, table_handle);

@ -290,6 +290,7 @@ DECLARE_bool(paranoid_file_checks);
DECLARE_bool(fail_if_options_file_error);
DECLARE_uint64(batch_protection_bytes_per_key);
DECLARE_uint32(memtable_protection_bytes_per_key);
DECLARE_uint32(block_protection_bytes_per_key);
DECLARE_uint64(user_timestamp_size);
DECLARE_string(secondary_cache_uri);

@ -975,6 +975,11 @@ DEFINE_uint32(
"specified number of bytes per key. Currently the supported "
"nonzero values are 1, 2, 4 and 8.");
DEFINE_uint32(block_protection_bytes_per_key, 0,
"If nonzero, enables integrity protection in blocks at the "
"specified number of bytes per key. Currently the supported "
"nonzero values are 1, 2, 4 and 8.");
DEFINE_string(file_checksum_impl, "none",
"Name of an implementation for file_checksum_gen_factory, or "
"\"none\" for null.");

@ -3122,6 +3122,7 @@ void InitializeOptionsFromFlags(
FLAGS_verify_sst_unique_id_in_manifest;
options.memtable_protection_bytes_per_key =
FLAGS_memtable_protection_bytes_per_key;
options.block_protection_bytes_per_key = FLAGS_block_protection_bytes_per_key;
// Integrated BlobDB
options.enable_blob_files = FLAGS_enable_blob_files;

@ -92,7 +92,8 @@ TableReader* NewTableReader(const std::string& sst_file_path,
if (s.ok()) {
ImmutableOptions iopts(options, cf_ioptions);
TableReaderOptions t_opt(iopts, /*prefix_extractor=*/nullptr, env_options,
cf_ioptions.internal_comparator);
cf_ioptions.internal_comparator,
0 /* block_protection_bytes_per_key */);
t_opt.largest_seqno = kMaxSequenceNumber;
s = options.table_factory->NewTableReader(t_opt, std::move(file_reader),
file_size, &table_reader,

@ -1122,6 +1122,20 @@ struct AdvancedColumnFamilyOptions {
// only compatible changes are allowed.
bool persist_user_defined_timestamps = true;
// Enable/disable per key-value checksum protection for in memory blocks.
//
// Checksum is constructed when a block is loaded into memory and verification
// is done for each key read from the block. This is useful for detecting
// in-memory data corruption. Note that this feature has a non-trivial
// negative impact on read performance. Different values of the
// option have similar performance impact, but different memory cost and
// corruption detection probability (e.g. 1 byte gives 255/256 chance for
// detecting a corruption).
//
// Default: 0 (no protection)
// Supported values: 0, 1, 2, 4, 8.
uint8_t block_protection_bytes_per_key = 0;
// Create ColumnFamilyOptions with default values for all fields
AdvancedColumnFamilyOptions();
// Create ColumnFamilyOptions from Options

@ -488,6 +488,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
{offsetof(struct MutableCFOptions, memtable_protection_bytes_per_key),
OptionType::kUInt32T, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
{"block_protection_bytes_per_key",
{offsetof(struct MutableCFOptions, block_protection_bytes_per_key),
OptionType::kUInt8T, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
{kOptNameCompOpts,
OptionTypeInfo::Struct(
kOptNameCompOpts, &compression_options_type_info,
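
Since `block_protection_bytes_per_key` is registered above with `OptionTypeFlags::kMutable`, it should also be adjustable at runtime. A minimal sketch, assuming the standard `DB::SetOptions` string API (how an updated value interacts with already-open table readers is not covered here):

```
#include <cassert>

#include "rocksdb/db.h"

// Sketch: change the protection level dynamically on an open DB; the
// option is registered as mutable, so SetOptions should accept it.
void SetBlockProtection(rocksdb::DB* db) {
  rocksdb::Status s =
      db->SetOptions({{"block_protection_bytes_per_key", "2"}});
  assert(s.ok());
}
```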

@ -172,6 +172,7 @@ struct MutableCFOptions {
: options.last_level_temperature),
memtable_protection_bytes_per_key(
options.memtable_protection_bytes_per_key),
block_protection_bytes_per_key(options.block_protection_bytes_per_key),
sample_for_compression(
options.sample_for_compression), // TODO: is 0 fine here?
compression_per_level(options.compression_per_level) {
@ -222,6 +223,7 @@ struct MutableCFOptions {
bottommost_compression(kDisableCompressionOption),
last_level_temperature(Temperature::kUnknown),
memtable_protection_bytes_per_key(0),
block_protection_bytes_per_key(0),
sample_for_compression(0) {}
explicit MutableCFOptions(const Options& options);
@ -312,6 +314,7 @@ struct MutableCFOptions {
CompressionOptions bottommost_compression_opts;
Temperature last_level_temperature;
uint32_t memtable_protection_bytes_per_key;
uint8_t block_protection_bytes_per_key;
uint64_t sample_for_compression;
std::vector<CompressionType> compression_per_level;

@ -206,6 +206,8 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
moptions.experimental_mempurge_threshold;
cf_opts->memtable_protection_bytes_per_key =
moptions.memtable_protection_bytes_per_key;
cf_opts->block_protection_bytes_per_key =
moptions.block_protection_bytes_per_key;
// Compaction related options
cf_opts->disable_auto_compactions = moptions.disable_auto_compactions;

@ -552,7 +552,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
"compaction=false;age_for_warm=1;};"
"blob_cache=1M;"
"memtable_protection_bytes_per_key=2;"
"persist_user_defined_timestamps=true;",
"persist_user_defined_timestamps=true;"
"block_protection_bytes_per_key=1;",
new_options));
ASSERT_NE(new_options->blob_cache.get(), nullptr);

@ -30,7 +30,7 @@ namespace ROCKSDB_NAMESPACE {
// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
// "*value_length", respectively. Will not derefence past "limit".
// "*value_length", respectively. Will not dereference past "limit".
//
// If any errors are detected, returns nullptr. Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
@ -137,17 +137,26 @@ struct DecodeEntryV4 {
return DecodeKeyV4()(p, limit, shared, non_shared);
}
};
void DataBlockIter::NextImpl() {
#ifndef NDEBUG
if (TEST_Corrupt_Callback("DataBlockIter::NextImpl")) return;
#endif
bool is_shared = false;
ParseNextDataKey(&is_shared);
++cur_entry_idx_;
}
void MetaBlockIter::NextImpl() {
bool is_shared = false;
ParseNextKey<CheckAndDecodeEntry>(&is_shared);
++cur_entry_idx_;
}
void IndexBlockIter::NextImpl() { ParseNextIndexKey(); }
void IndexBlockIter::NextImpl() {
ParseNextIndexKey();
++cur_entry_idx_;
}
void IndexBlockIter::PrevImpl() {
assert(Valid());
@ -166,6 +175,7 @@ void IndexBlockIter::PrevImpl() {
// Loop until end of current entry hits the start of original entry
while (ParseNextIndexKey() && NextEntryOffset() < original) {
}
--cur_entry_idx_;
}
void MetaBlockIter::PrevImpl() {
@ -187,6 +197,7 @@ void MetaBlockIter::PrevImpl() {
while (ParseNextKey<CheckAndDecodeEntry>(&is_shared) &&
NextEntryOffset() < original) {
}
--cur_entry_idx_;
}
// Similar to IndexBlockIter::PrevImpl but also caches the prev entries
@ -195,6 +206,7 @@ void DataBlockIter::PrevImpl() {
assert(prev_entries_idx_ == -1 ||
static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
--cur_entry_idx_;
// Check if we can use cached prev_entries_
if (prev_entries_idx_ > 0 &&
prev_entries_[prev_entries_idx_].offset == current_) {
@ -319,10 +331,10 @@ void MetaBlockIter::SeekImpl(const Slice& target) {
// inclusive; AND
// 2) the last key of this block has a greater user_key from seek_user_key
//
// If the return value is TRUE, iter location has two possibilies:
// 1) If iter is valid, it is set to a location as if set by BinarySeek. In
// this case, it points to the first key with a larger user_key or a matching
// user_key with a seqno no greater than the seeking seqno.
// If the return value is TRUE, iter location has two possibilities:
// 1) If iter is valid, it is set to a location as if set by SeekImpl(target).
// In this case, it points to the first key with a larger user_key or a
// matching user_key with a seqno no greater than the seeking seqno.
// 2) If the iter is invalid, it means that either all the user_keys are less
// than the seek_user_key, or the block ends with a matching user_key but
// with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno
@ -347,11 +359,11 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) {
// boundary key: axy@50 (we make minimal assumption about a boundary key)
// Block N+1: [axy@10, ... ]
//
// If seek_key = axy@60, the search will starts from Block N.
// If seek_key = axy@60, the search will start from Block N.
// Even if the user_key is not found in the hash map, the caller still
// has to continue searching the next block.
//
// In this case, we pretend the key is the the last restart interval.
// In this case, we pretend the key is in the last restart interval.
// The while-loop below will search the last restart interval for the
// key. It will stop at the first key that is larger than the seek_key,
// or at the end of the block if no key is larger.
@ -364,12 +376,15 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) {
assert(restart_index < num_restarts_);
SeekToRestartPoint(restart_index);
current_ = GetRestartPoint(restart_index);
cur_entry_idx_ =
static_cast<int32_t>(restart_index * block_restart_interval_) - 1;
uint32_t limit = restarts_;
if (restart_index + 1 < num_restarts_) {
limit = GetRestartPoint(restart_index + 1);
}
while (current_ < limit) {
++cur_entry_idx_;
bool shared;
// Here we only linear seek the target key inside the restart interval.
// If a key does not exist inside a restart interval, we avoid
@ -381,14 +396,20 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) {
// we stop at the first potential matching user key.
break;
}
// If the loop exits due to CompareCurrentKey(target) >= 0, then current key
// exists, and its checksum verification will be done in UpdateKey() called
// in SeekForGet().
// TODO(cbi): If this loop exits with current_ == restarts_, per key-value
// checksum will not be verified in UpdateKey() since Valid()
// will return false.
}
if (current_ == restarts_) {
// Search reaches to the end of the block. There are three possibilites:
// 1) there is only one user_key match in the block (otherwise collsion).
// Search reaches to the end of the block. There are three possibilities:
// 1) there is only one user_key match in the block (otherwise collision).
// the matching user_key resides in the last restart interval, and it
// is the last key of the restart interval and of the block as well.
// ParseNextKey() skiped it as its [ type | seqno ] is smaller.
// ParseNextKey() skipped it as its [ type | seqno ] is smaller.
//
// 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
// AND all existing user_keys in the restart interval are smaller than
@ -424,6 +445,9 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) {
}
void IndexBlockIter::SeekImpl(const Slice& target) {
#ifndef NDEBUG
if (TEST_Corrupt_Callback("IndexBlockIter::SeekImpl")) return;
#endif
TEST_SYNC_POINT("IndexBlockIter::Seek:0");
PERF_TIMER_GUARD(block_seek_nanos);
if (data_ == nullptr) { // Not init yet
@ -478,7 +502,9 @@ void DataBlockIter::SeekForPrevImpl(const Slice& target) {
FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
if (!Valid()) {
SeekToLastImpl();
if (status_.ok()) {
SeekToLastImpl();
}
} else {
while (Valid() && CompareCurrentKey(seek_key) > 0) {
PrevImpl();
@ -502,7 +528,9 @@ void MetaBlockIter::SeekForPrevImpl(const Slice& target) {
FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
if (!Valid()) {
SeekToLastImpl();
if (status_.ok()) {
SeekToLastImpl();
}
} else {
while (Valid() && CompareCurrentKey(seek_key) > 0) {
PrevImpl();
@ -517,6 +545,7 @@ void DataBlockIter::SeekToFirstImpl() {
SeekToRestartPoint(0);
bool is_shared = false;
ParseNextDataKey(&is_shared);
cur_entry_idx_ = 0;
}
void MetaBlockIter::SeekToFirstImpl() {
@ -526,15 +555,20 @@ void MetaBlockIter::SeekToFirstImpl() {
SeekToRestartPoint(0);
bool is_shared = false;
ParseNextKey<CheckAndDecodeEntry>(&is_shared);
cur_entry_idx_ = 0;
}
void IndexBlockIter::SeekToFirstImpl() {
#ifndef NDEBUG
if (TEST_Corrupt_Callback("IndexBlockIter::SeekToFirstImpl")) return;
#endif
if (data_ == nullptr) { // Not init yet
return;
}
status_ = Status::OK();
SeekToRestartPoint(0);
ParseNextIndexKey();
cur_entry_idx_ = 0;
}
void DataBlockIter::SeekToLastImpl() {
@ -543,8 +577,10 @@ void DataBlockIter::SeekToLastImpl() {
}
SeekToRestartPoint(num_restarts_ - 1);
bool is_shared = false;
cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_;
while (ParseNextDataKey(&is_shared) && NextEntryOffset() < restarts_) {
// Keep skipping
++cur_entry_idx_;
}
}
@ -554,9 +590,13 @@ void MetaBlockIter::SeekToLastImpl() {
}
SeekToRestartPoint(num_restarts_ - 1);
bool is_shared = false;
assert(num_restarts_ >= 1);
cur_entry_idx_ =
static_cast<int32_t>((num_restarts_ - 1) * block_restart_interval_);
while (ParseNextKey<CheckAndDecodeEntry>(&is_shared) &&
NextEntryOffset() < restarts_) {
// Keep skipping
// Will probably never reach here since restart_interval is always 1
++cur_entry_idx_;
}
}
@ -566,20 +606,12 @@ void IndexBlockIter::SeekToLastImpl() {
}
status_ = Status::OK();
SeekToRestartPoint(num_restarts_ - 1);
cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_;
while (ParseNextIndexKey() && NextEntryOffset() < restarts_) {
// Keep skipping
++cur_entry_idx_;
}
}
template <class TValue>
void BlockIter<TValue>::CorruptionError() {
current_ = restarts_;
restart_index_ = num_restarts_;
status_ = Status::Corruption("bad entry in block");
raw_key_.Clear();
value_.clear();
}
template <class TValue>
template <typename DecodeEntryFunc>
bool BlockIter<TValue>::ParseNextKey(bool* is_shared) {
@ -666,12 +698,12 @@ bool IndexBlockIter::ParseNextIndexKey() {
// restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// ...
// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// where, k is key, v is value, and its encoding is in parenthesis.
// where, k is key, v is value, and its encoding is in parentheses.
// The format of each key is (shared_size, non_shared_size, shared, non_shared)
// The format of each value, i.e., block handle, is (offset, size) whenever
// is_shared is false, which includes the first entry in each restart point.
// Otherwise the format is delta-size = block handle size - size of last block
// handle.
// Otherwise, the format is delta-size = the size of current block - the size of
// last block.
void IndexBlockIter::DecodeCurrentValue(bool is_shared) {
Slice v(value_.data(), data_ + restarts_ - value_.data());
// Delta encoding is used if `shared` != 0.
@ -710,6 +742,7 @@ void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
// to follow it up with NextImpl() to position the iterator at the restart
// key.
SeekToRestartPoint(index);
cur_entry_idx_ = static_cast<int32_t>(index * block_restart_interval_) - 1;
NextImpl();
if (!skip_linear_scan) {
@ -728,6 +761,8 @@ void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
while (true) {
NextImpl();
if (!Valid()) {
// TODO(cbi): per key-value checksum will not be verified in UpdateKey()
// since Valid() will return false.
break;
}
if (current_ == max_offset) {
@ -976,6 +1011,7 @@ Block::~Block() {
// This sync point can be re-enabled if RocksDB can control the
// initialization order of any/all static options created by the user.
// TEST_SYNC_POINT("Block::~Block");
delete[] kv_checksum_;
}
Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
@ -1035,6 +1071,126 @@ Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
}
}
void Block::InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key,
const Comparator* raw_ucmp) {
protection_bytes_per_key_ = 0;
if (protection_bytes_per_key > 0 && num_restarts_ > 0) {
// NewDataIterator() is called with protection_bytes_per_key_ = 0.
// This is intended since checksum is not constructed yet.
//
// We do not know global_seqno yet, so checksum computation and
// verification all assume global_seqno = 0.
std::unique_ptr<DataBlockIter> iter{NewDataIterator(
raw_ucmp, kDisableGlobalSequenceNumber, nullptr /* iter */,
nullptr /* stats */, true /* block_contents_pinned */)};
if (iter->status().ok()) {
block_restart_interval_ = iter->GetRestartInterval();
}
uint32_t num_keys = 0;
if (iter->status().ok()) {
num_keys = iter->NumberOfKeys(block_restart_interval_);
}
if (iter->status().ok()) {
checksum_size_ = num_keys * protection_bytes_per_key;
kv_checksum_ = new char[(size_t)checksum_size_];
size_t i = 0;
iter->SeekToFirst();
while (iter->Valid()) {
GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key,
iter->key(), iter->value());
iter->Next();
i += protection_bytes_per_key;
}
assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
}
if (!iter->status().ok()) {
size_ = 0; // Error marker
return;
}
protection_bytes_per_key_ = protection_bytes_per_key;
}
}
void Block::InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key,
const Comparator* raw_ucmp,
bool value_is_full,
bool index_has_first_key) {
protection_bytes_per_key_ = 0;
if (num_restarts_ > 0 && protection_bytes_per_key > 0) {
// Note that `global_seqno` and `key_includes_seq` are hardcoded here. They
// do not impact how the index block is parsed. During checksum
// construction/verification, we use the entire key buffer from
// raw_key_.GetKey() returned by iter->key() as the `key` part of key-value
// checksum, and the content of this buffer does not change for different
// values of `global_seqno` or `key_includes_seq`.
std::unique_ptr<IndexBlockIter> iter{NewIndexIterator(
raw_ucmp, kDisableGlobalSequenceNumber /* global_seqno */, nullptr,
nullptr /* Statistics */, true /* total_order_seek */,
index_has_first_key /* have_first_key */, false /* key_includes_seq */,
value_is_full, true /* block_contents_pinned */,
nullptr /* prefix_index */)};
if (iter->status().ok()) {
block_restart_interval_ = iter->GetRestartInterval();
}
uint32_t num_keys = 0;
if (iter->status().ok()) {
num_keys = iter->NumberOfKeys(block_restart_interval_);
}
if (iter->status().ok()) {
checksum_size_ = num_keys * protection_bytes_per_key;
kv_checksum_ = new char[(size_t)checksum_size_];
iter->SeekToFirst();
size_t i = 0;
while (iter->Valid()) {
GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key,
iter->key(), iter->raw_value());
iter->Next();
i += protection_bytes_per_key;
}
assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
}
if (!iter->status().ok()) {
size_ = 0; // Error marker
return;
}
protection_bytes_per_key_ = protection_bytes_per_key;
}
}
void Block::InitializeMetaIndexBlockProtectionInfo(
uint8_t protection_bytes_per_key) {
protection_bytes_per_key_ = 0;
if (num_restarts_ > 0 && protection_bytes_per_key > 0) {
std::unique_ptr<MetaBlockIter> iter{
NewMetaIterator(true /* block_contents_pinned */)};
if (iter->status().ok()) {
block_restart_interval_ = iter->GetRestartInterval();
}
uint32_t num_keys = 0;
if (iter->status().ok()) {
num_keys = iter->NumberOfKeys(block_restart_interval_);
}
if (iter->status().ok()) {
checksum_size_ = num_keys * protection_bytes_per_key;
kv_checksum_ = new char[(size_t)checksum_size_];
iter->SeekToFirst();
size_t i = 0;
while (iter->Valid()) {
GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key,
iter->key(), iter->value());
iter->Next();
i += protection_bytes_per_key;
}
assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
}
if (!iter->status().ok()) {
size_ = 0; // Error marker
return;
}
protection_bytes_per_key_ = protection_bytes_per_key;
}
}
MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) {
MetaBlockIter* iter = new MetaBlockIter();
if (size_ < 2 * sizeof(uint32_t)) {
@ -1045,7 +1201,8 @@ MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) {
iter->Invalidate(Status::OK());
} else {
iter->Initialize(data_, restart_offset_, num_restarts_,
block_contents_pinned);
block_contents_pinned, protection_bytes_per_key_,
kv_checksum_, block_restart_interval_);
}
return iter;
}
@ -1072,7 +1229,8 @@ DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
ret_iter->Initialize(
raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno,
read_amp_bitmap_.get(), block_contents_pinned,
data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr);
data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr,
protection_bytes_per_key_, kv_checksum_, block_restart_interval_);
if (read_amp_bitmap_) {
if (read_amp_bitmap_->GetStatistics() != stats) {
// DB changed the Statistics pointer, we need to notify read_amp_bitmap_
@ -1108,8 +1266,9 @@ IndexBlockIter* Block::NewIndexIterator(
total_order_seek ? nullptr : prefix_index;
ret_iter->Initialize(raw_ucmp, data_, restart_offset_, num_restarts_,
global_seqno, prefix_index_ptr, have_first_key,
key_includes_seq, value_is_full,
block_contents_pinned);
key_includes_seq, value_is_full, block_contents_pinned,
protection_bytes_per_key_, kv_checksum_,
block_restart_interval_);
}
return ret_iter;
@ -1125,6 +1284,7 @@ size_t Block::ApproximateMemoryUsage() const {
if (read_amp_bitmap_) {
usage += read_amp_bitmap_->ApproximateMemoryUsage();
}
usage += checksum_size_;
return usage;
}

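Taking stock of the block.cc changes above before the header diff: `kv_checksum_` is a flat array holding one `protection_bytes_per_key`-byte slot per entry, laid out in iteration order, and an iterator finds its slot by entry index. A standalone sketch of that arithmetic (helper names are illustrative; the ProtectionInfo64 calls are the same ones used above):

```
#include "db/kv_checksum.h"

using ROCKSDB_NAMESPACE::ProtectionInfo64;
using ROCKSDB_NAMESPACE::Slice;

// Mirrors Block::GenerateKVChecksum(): write the checksum for entry
// `entry_idx` into its slot in the per-block checksum array.
void WriteEntryChecksum(char* checksums, uint8_t bytes_per_key,
                        uint32_t entry_idx, const Slice& key,
                        const Slice& value) {
  ProtectionInfo64().ProtectKV(key, value).Encode(
      bytes_per_key, checksums + bytes_per_key * entry_idx);
}

// Mirrors the verification in BlockIter::UpdateKey(): recompute the
// protection info for (key, value) and compare against the stored slot.
bool VerifyEntryChecksum(const char* checksums, uint8_t bytes_per_key,
                         uint32_t entry_idx, const Slice& key,
                         const Slice& value) {
  return ProtectionInfo64().ProtectKV(key, value).Verify(
      bytes_per_key, checksums + bytes_per_key * entry_idx);
}
```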
@ -14,6 +14,7 @@
#include <string>
#include <vector>
#include "db/kv_checksum.h"
#include "db/pinned_iterators_manager.h"
#include "port/malloc.h"
#include "rocksdb/advanced_cache.h"
@ -240,6 +241,34 @@ class Block {
// For TypedCacheInterface
const Slice& ContentSlice() const { return contents_.data; }
// Initializes per key-value checksum protection.
// After this method is called, each DataBlockIterator returned
// by NewDataIterator will verify per key-value checksum for any key it read.
void InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key,
const Comparator* raw_ucmp);
// Initializes per key-value checksum protection.
// After this method is called, each IndexBlockIterator returned
// by NewIndexIterator will verify per key-value checksum for any key it read.
// value_is_full and index_has_first_key are needed to be able to parse
// the index block content and construct checksums.
void InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key,
const Comparator* raw_ucmp,
bool value_is_full,
bool index_has_first_key);
// Initializes per key-value checksum protection.
// After this method is called, each MetaBlockIter returned
// by NewMetaIterator will verify per key-value checksum for any key it read.
void InitializeMetaIndexBlockProtectionInfo(uint8_t protection_bytes_per_key);
static void GenerateKVChecksum(char* checksum_ptr, uint8_t checksum_len,
const Slice& key, const Slice& value) {
ProtectionInfo64().ProtectKV(key, value).Encode(checksum_len, checksum_ptr);
}
const char* TEST_GetKVChecksum() const { return kv_checksum_; }
private:
BlockContents contents_;
const char* data_; // contents_.data.data()
@ -247,6 +276,11 @@ class Block {
uint32_t restart_offset_; // Offset in data_ of restart array
uint32_t num_restarts_;
std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_;
char* kv_checksum_{nullptr};
uint32_t checksum_size_{0};
// Used by block iterators to calculate current key index within a block
uint32_t block_restart_interval_{0};
uint8_t protection_bytes_per_key_{0};
DataBlockHashIndex data_block_hash_index_;
};
@ -269,6 +303,14 @@ class Block {
// `Seek()` logic would be implemented by subclasses in `SeekImpl()`. These
// "Impl" functions are responsible for positioning `raw_key_` but not
// invoking `UpdateKey()`.
//
// Per key-value checksum is enabled if relevant states are passed in during
// `InitializeBase()`. The checksum verification is done in each call to
// UpdateKey() for the current key. Each subclass is responsible for keeping
// track of cur_entry_idx_, the index of the current key within the block.
// BlockIter uses this index to get the corresponding checksum for current key.
// Additional checksum verification may be done in subclasses if they read keys
// other than the key being processed in UpdateKey().
template <class TValue>
class BlockIter : public InternalIteratorBase<TValue> {
public:
@ -286,9 +328,16 @@ class BlockIter : public InternalIteratorBase<TValue> {
Cleanable::Reset();
}
bool Valid() const override { return current_ < restarts_; }
bool Valid() const override {
// When status_ is not ok, iter should be invalid.
assert(status_.ok() || current_ >= restarts_);
return current_ < restarts_;
}
virtual void SeekToFirst() override final {
#ifndef NDEBUG
if (TEST_Corrupt_Callback("BlockIter::SeekToFirst")) return;
#endif
SeekToFirstImpl();
UpdateKey();
}
@ -325,6 +374,7 @@ class BlockIter : public InternalIteratorBase<TValue> {
}
Status status() const override { return status_; }
Slice key() const override {
assert(Valid());
return key_;
@ -337,10 +387,22 @@ class BlockIter : public InternalIteratorBase<TValue> {
(pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));
status_.PermitUncheckedError();
}
void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
pinned_iters_mgr_ = pinned_iters_mgr;
}
PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
bool TEST_Corrupt_Callback(const std::string& sync_point) {
bool corrupt = false;
TEST_SYNC_POINT_CALLBACK(sync_point, static_cast<void*>(&corrupt));
if (corrupt) {
CorruptionError();
}
return corrupt;
}
#endif
bool IsKeyPinned() const override {
@ -377,27 +439,74 @@ class BlockIter : public InternalIteratorBase<TValue> {
Status status_;
// Key to be exposed to users.
Slice key_;
SequenceNumber global_seqno_;
// Per key-value checksum related states
const char* kv_checksum_;
int32_t cur_entry_idx_;
uint32_t block_restart_interval_;
uint8_t protection_bytes_per_key_;
bool key_pinned_;
// Whether the block data is guaranteed to outlive this iterator, and
// as long as the cleanup functions are transferred to another class,
// e.g. PinnableSlice, the pointer to the bytes will still be valid.
bool block_contents_pinned_;
SequenceNumber global_seqno_;
virtual void SeekToFirstImpl() = 0;
virtual void SeekToLastImpl() = 0;
virtual void SeekImpl(const Slice& target) = 0;
virtual void SeekForPrevImpl(const Slice& target) = 0;
virtual void NextImpl() = 0;
virtual void PrevImpl() = 0;
// Returns the restart interval of this block.
// Returns 0 if num_restarts_ <= 1 or if the BlockIter is not initialized.
virtual uint32_t GetRestartInterval() {
if (num_restarts_ <= 1 || data_ == nullptr) {
return 0;
}
SeekToFirstImpl();
uint32_t end_index = GetRestartPoint(1);
uint32_t count = 1;
while (NextEntryOffset() < end_index && status_.ok()) {
assert(Valid());
NextImpl();
++count;
}
return count;
}
// Returns the number of keys in this block.
virtual uint32_t NumberOfKeys(uint32_t block_restart_interval) {
if (num_restarts_ == 0 || data_ == nullptr) {
return 0;
}
uint32_t count = (num_restarts_ - 1) * block_restart_interval;
// Add number of keys from the last restart interval
SeekToRestartPoint(num_restarts_ - 1);
while (NextEntryOffset() < restarts_ && status_.ok()) {
NextImpl();
++count;
}
return count;
}
// Stores whether the current key shares bytes with the prev key in
// *is_shared.
// Sets raw_key_, value_ to the current parsed key and value.
// Sets restart_index_ to point to the restart interval that contains
// the current key.
template <typename DecodeEntryFunc>
inline bool ParseNextKey(bool* is_shared);
// protection_bytes_per_key, kv_checksum, and block_restart_interval
// are needed only for per kv checksum verification.
void InitializeBase(const Comparator* raw_ucmp, const char* data,
uint32_t restarts, uint32_t num_restarts,
SequenceNumber global_seqno, bool block_contents_pinned) {
SequenceNumber global_seqno, bool block_contents_pinned,
uint8_t protection_bytes_per_key, const char* kv_checksum,
uint32_t block_restart_interval) {
assert(data_ == nullptr); // Ensure it is called only once
assert(num_restarts > 0); // Ensure the param is valid
@ -410,11 +519,41 @@ class BlockIter : public InternalIteratorBase<TValue> {
global_seqno_ = global_seqno;
block_contents_pinned_ = block_contents_pinned;
cache_handle_ = nullptr;
cur_entry_idx_ = -1;
protection_bytes_per_key_ = protection_bytes_per_key;
kv_checksum_ = kv_checksum;
block_restart_interval_ = block_restart_interval;
// Checksum related states are either all 0/nullptr or all non-zero.
// One exception is when num_restarts == 1: block_restart_interval can be 0
// since we are not able to compute it.
assert((protection_bytes_per_key == 0 && kv_checksum == nullptr) ||
(protection_bytes_per_key > 0 && kv_checksum != nullptr &&
(block_restart_interval > 0 || num_restarts == 1)));
}
void CorruptionError(const std::string& error_msg = "bad entry in block") {
current_ = restarts_;
restart_index_ = num_restarts_;
status_ = Status::Corruption(error_msg);
raw_key_.Clear();
value_.clear();
}
void PerKVChecksumCorruptionError() {
std::string error_msg{
"Corrupted block entry: per key-value checksum verification "
"failed."};
error_msg.append(" Offset: " + std::to_string(current_) + ".");
error_msg.append(" Entry index: " + std::to_string(cur_entry_idx_) + ".");
CorruptionError(error_msg);
}
// Must be called every time a key is found that needs to be returned to user,
// and may be called when no key is found (as a no-op). Updates `key_`,
// `key_buf_`, and `key_pinned_` with info about the found key.
// Per key-value checksum verification is done if available for the key to be
// returned. Iterator is invalidated with corruption status if checksum
// verification fails.
void UpdateKey() {
key_buf_.Clear();
if (!Valid()) {
@ -433,6 +572,19 @@ class BlockIter : public InternalIteratorBase<TValue> {
key_ = key_buf_.GetInternalKey();
key_pinned_ = false;
}
TEST_SYNC_POINT_CALLBACK("BlockIter::UpdateKey::value",
(void*)value_.data());
TEST_SYNC_POINT_CALLBACK("Block::VerifyChecksum::checksum_len",
&protection_bytes_per_key_);
if (protection_bytes_per_key_ > 0) {
if (!ProtectionInfo64()
.ProtectKV(raw_key_.GetKey(), value_)
.Verify(
protection_bytes_per_key_,
kv_checksum_ + protection_bytes_per_key_ * cur_entry_idx_)) {
PerKVChecksumCorruptionError();
}
}
}
// Returns the result of `Comparator::Compare()`, where the appropriate
@ -464,7 +616,7 @@ class BlockIter : public InternalIteratorBase<TValue> {
return static_cast<uint32_t>((value_.data() + value_.size()) - data_);
}
uint32_t GetRestartPoint(uint32_t index) {
uint32_t GetRestartPoint(uint32_t index) const {
assert(index < num_restarts_);
return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
}
@ -479,13 +631,20 @@ class BlockIter : public InternalIteratorBase<TValue> {
value_ = Slice(data_ + offset, 0);
}
void CorruptionError();
protected:
template <typename DecodeKeyFunc>
inline bool BinarySeek(const Slice& target, uint32_t* index,
bool* is_index_key_result);
// Find the first key in restart interval `index` that is >= `target`.
// If there is no such key, iterator is positioned at the first key in
// restart interval `index + 1`.
// If is_index_key_result is true, it positions the iterator at the first key
// in this restart interval.
// Per key-value checksum verification is done for all keys scanned
// up to but not including the last key (the key that current_ points to
// when this function returns). This key's checksum is verified in
// UpdateKey().
void FindKeyAfterBinarySeek(const Slice& target, uint32_t index,
bool is_index_key_result);
};
@ -494,22 +653,17 @@ class DataBlockIter final : public BlockIter<Slice> {
public:
DataBlockIter()
: BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {}
DataBlockIter(const Comparator* raw_ucmp, const char* data, uint32_t restarts,
uint32_t num_restarts, SequenceNumber global_seqno,
BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned,
DataBlockHashIndex* data_block_hash_index)
: DataBlockIter() {
Initialize(raw_ucmp, data, restarts, num_restarts, global_seqno,
read_amp_bitmap, block_contents_pinned, data_block_hash_index);
}
void Initialize(const Comparator* raw_ucmp, const char* data,
uint32_t restarts, uint32_t num_restarts,
SequenceNumber global_seqno,
BlockReadAmpBitmap* read_amp_bitmap,
bool block_contents_pinned,
DataBlockHashIndex* data_block_hash_index) {
DataBlockHashIndex* data_block_hash_index,
uint8_t protection_bytes_per_key, const char* kv_checksum,
uint32_t block_restart_interval) {
InitializeBase(raw_ucmp, data, restarts, num_restarts, global_seqno,
block_contents_pinned);
block_contents_pinned, protection_bytes_per_key, kv_checksum,
block_restart_interval);
raw_key_.SetIsUserKey(false);
read_amp_bitmap_ = read_amp_bitmap;
last_bitmap_offset_ = current_ + 1;
@ -527,7 +681,11 @@ class DataBlockIter final : public BlockIter<Slice> {
return value_;
}
// Returns whether `target` may exist.
inline bool SeekForGet(const Slice& target) {
#ifndef NDEBUG
if (TEST_Corrupt_Callback("DataBlockIter::SeekForGet")) return true;
#endif
if (!data_block_hash_index_) {
SeekImpl(target);
UpdateKey();
@ -599,11 +757,14 @@ class MetaBlockIter final : public BlockIter<Slice> {
public:
MetaBlockIter() : BlockIter() { raw_key_.SetIsUserKey(true); }
void Initialize(const char* data, uint32_t restarts, uint32_t num_restarts,
bool block_contents_pinned) {
bool block_contents_pinned, uint8_t protection_bytes_per_key,
const char* kv_checksum, uint32_t block_restart_interval) {
// Initializes the iterator with a BytewiseComparator and
// the raw key being a user key.
InitializeBase(BytewiseComparator(), data, restarts, num_restarts,
kDisableGlobalSequenceNumber, block_contents_pinned);
kDisableGlobalSequenceNumber, block_contents_pinned,
protection_bytes_per_key, kv_checksum,
block_restart_interval);
raw_key_.SetIsUserKey(true);
}
@ -613,12 +774,17 @@ class MetaBlockIter final : public BlockIter<Slice> {
}
protected:
friend Block;
void SeekToFirstImpl() override;
void SeekToLastImpl() override;
void SeekImpl(const Slice& target) override;
void SeekForPrevImpl(const Slice& target) override;
void NextImpl() override;
void PrevImpl() override;
// Meta index block's restart interval is always 1. See
// MetaIndexBuilder::MetaIndexBuilder() for hard-coded restart interval.
uint32_t GetRestartInterval() override { return 1; }
uint32_t NumberOfKeys(uint32_t) override { return num_restarts_; }
};
class IndexBlockIter final : public BlockIter<IndexValue> {
@ -633,9 +799,13 @@ class IndexBlockIter final : public BlockIter<IndexValue> {
uint32_t restarts, uint32_t num_restarts,
SequenceNumber global_seqno, BlockPrefixIndex* prefix_index,
bool have_first_key, bool key_includes_seq,
bool value_is_full, bool block_contents_pinned) {
bool value_is_full, bool block_contents_pinned,
uint8_t protection_bytes_per_key, const char* kv_checksum,
uint32_t block_restart_interval) {
InitializeBase(raw_ucmp, data, restarts, num_restarts,
kDisableGlobalSequenceNumber, block_contents_pinned);
kDisableGlobalSequenceNumber, block_contents_pinned,
protection_bytes_per_key, kv_checksum,
block_restart_interval);
raw_key_.SetIsUserKey(!key_includes_seq);
prefix_index_ = prefix_index;
value_delta_encoded_ = !value_is_full;
@ -666,11 +836,17 @@ class IndexBlockIter final : public BlockIter<IndexValue> {
}
}
Slice raw_value() const {
assert(Valid());
return value_;
}
bool IsValuePinned() const override {
return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned();
}
protected:
friend Block;
// IndexBlockIter follows a different contract for prefix iterator
// from data iterators.
// If prefix of the seek key `target` exists in the file, it must
@ -692,11 +868,8 @@ class IndexBlockIter final : public BlockIter<IndexValue> {
}
void PrevImpl() override;
void NextImpl() override;
void SeekToFirstImpl() override;
void SeekToLastImpl() override;
private:

@ -450,7 +450,12 @@ struct BlockBasedTableBuilder::Rep {
table_options, data_block)),
create_context(&table_options, ioptions.stats,
compression_type == kZSTD ||
compression_type == kZSTDNotFinalCompression),
compression_type == kZSTDNotFinalCompression,
tbo.moptions.block_protection_bytes_per_key,
tbo.internal_comparator.user_comparator(),
!use_delta_encoding_for_index_values,
table_opt.index_type ==
BlockBasedTableOptions::kBinarySearchWithFirstKey),
status_ok(true),
io_status_ok(true) {
if (tbo.target_file_size == 0) {

@ -567,7 +567,8 @@ Status BlockBasedTableFactory::NewTableReader(
return BlockBasedTable::Open(
ro, table_reader_options.ioptions, table_reader_options.env_options,
table_options_, table_reader_options.internal_comparator, std::move(file),
file_size, table_reader, table_reader_cache_res_mgr_,
file_size, table_reader_options.block_protection_bytes_per_key,
table_reader, table_reader_cache_res_mgr_,
table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache,
table_reader_options.skip_filters, table_reader_options.level,
table_reader_options.immortal, table_reader_options.largest_seqno,

@ -560,6 +560,7 @@ Status BlockBasedTable::Open(
const EnvOptions& env_options, const BlockBasedTableOptions& table_options,
const InternalKeyComparator& internal_comparator,
std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
uint8_t block_protection_bytes_per_key,
std::unique_ptr<TableReader>* table_reader,
std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
@ -645,6 +646,7 @@ Status BlockBasedTable::Open(
// meta-block reads.
rep->compression_dict_handle = BlockHandle::NullBlockHandle();
rep->create_context.protection_bytes_per_key = block_protection_bytes_per_key;
// Read metaindex
std::unique_ptr<BlockBasedTable> new_table(
new BlockBasedTable(rep, block_cache_tracer));
@ -671,9 +673,11 @@ Status BlockBasedTable::Open(
CompressionTypeToString(kZSTD) ||
rep->table_properties->compression_name ==
CompressionTypeToString(kZSTDNotFinalCompression));
rep->create_context =
BlockCreateContext(&rep->table_options, rep->ioptions.stats,
blocks_definitely_zstd_compressed);
rep->create_context = BlockCreateContext(
&rep->table_options, rep->ioptions.stats,
blocks_definitely_zstd_compressed, block_protection_bytes_per_key,
rep->internal_comparator.user_comparator(), rep->index_value_is_full,
rep->index_has_first_key);
// Check expected unique id if provided
if (expected_unique_id != kNullUniqueId64x2) {
@ -2168,6 +2172,9 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
}
}
s = biter.status();
if (!s.ok()) {
break;
}
}
// Write the block cache access record.
if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {

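With the early break above, a per key-value checksum mismatch surfaces to callers as a non-OK status instead of silently corrupted data. A hypothetical application-side check (function name and handling are illustrative):

```
#include <iostream>

#include "rocksdb/db.h"

void ReadWithCorruptionCheck(rocksdb::DB* db, const rocksdb::Slice& key) {
  std::string value;
  rocksdb::Status s = db->Get(rocksdb::ReadOptions(), key, &value);
  if (s.IsCorruption()) {
    // With block_protection_bytes_per_key > 0, this includes per key-value
    // checksum verification failures detected in BlockIter::UpdateKey().
    std::cerr << "corruption detected: " << s.ToString() << std::endl;
  }
}
```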
@ -98,6 +98,7 @@ class BlockBasedTable : public TableReader {
const BlockBasedTableOptions& table_options,
const InternalKeyComparator& internal_key_comparator,
std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
uint8_t block_protection_bytes_per_key,
std::unique_ptr<TableReader>* table_reader,
std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr =
nullptr,

@ -116,8 +116,9 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
bool prefetch_index_and_filter_in_cache = true,
Status* status = nullptr) {
const MutableCFOptions moptions(options_);
TableReaderOptions table_reader_options = TableReaderOptions(
ioptions, moptions.prefix_extractor, EnvOptions(), comparator);
TableReaderOptions table_reader_options =
TableReaderOptions(ioptions, moptions.prefix_extractor, EnvOptions(),
comparator, 0 /* block_protection_bytes_per_key */);
std::unique_ptr<RandomAccessFileReader> file;
NewFileReader(table_name, foptions, &file);

@ -11,17 +11,25 @@ void BlockCreateContext::Create(std::unique_ptr<Block_kData>* parsed_out,
BlockContents&& block) {
parsed_out->reset(new Block_kData(
std::move(block), table_options->read_amp_bytes_per_bit, statistics));
parsed_out->get()->InitializeDataBlockProtectionInfo(protection_bytes_per_key,
raw_ucmp);
}
void BlockCreateContext::Create(std::unique_ptr<Block_kIndex>* parsed_out,
BlockContents&& block) {
parsed_out->reset(new Block_kIndex(std::move(block),
/*read_amp_bytes_per_bit*/ 0, statistics));
parsed_out->get()->InitializeIndexBlockProtectionInfo(
protection_bytes_per_key, raw_ucmp, index_value_is_full,
index_has_first_key);
}
void BlockCreateContext::Create(
std::unique_ptr<Block_kFilterPartitionIndex>* parsed_out,
BlockContents&& block) {
parsed_out->reset(new Block_kFilterPartitionIndex(
std::move(block), /*read_amp_bytes_per_bit*/ 0, statistics));
parsed_out->get()->InitializeIndexBlockProtectionInfo(
protection_bytes_per_key, raw_ucmp, index_value_is_full,
index_has_first_key);
}
void BlockCreateContext::Create(
std::unique_ptr<Block_kRangeDeletion>* parsed_out, BlockContents&& block) {
@ -32,6 +40,8 @@ void BlockCreateContext::Create(std::unique_ptr<Block_kMetaIndex>* parsed_out,
BlockContents&& block) {
parsed_out->reset(new Block_kMetaIndex(
std::move(block), /*read_amp_bytes_per_bit*/ 0, statistics));
parsed_out->get()->InitializeMetaIndexBlockProtectionInfo(
protection_bytes_per_key);
}
void BlockCreateContext::Create(

@ -70,14 +70,26 @@ class Block_kMetaIndex : public Block {
struct BlockCreateContext : public Cache::CreateContext {
BlockCreateContext() {}
BlockCreateContext(const BlockBasedTableOptions* _table_options,
Statistics* _statistics, bool _using_zstd)
Statistics* _statistics, bool _using_zstd,
uint8_t _protection_bytes_per_key,
const Comparator* _raw_ucmp,
bool _index_value_is_full = false,
bool _index_has_first_key = false)
: table_options(_table_options),
statistics(_statistics),
using_zstd(_using_zstd) {}
using_zstd(_using_zstd),
protection_bytes_per_key(_protection_bytes_per_key),
raw_ucmp(_raw_ucmp),
index_value_is_full(_index_value_is_full),
index_has_first_key(_index_has_first_key) {}
const BlockBasedTableOptions* table_options = nullptr;
Statistics* statistics = nullptr;
bool using_zstd = false;
uint8_t protection_bytes_per_key = 0;
const Comparator* raw_ucmp = nullptr;
bool index_value_is_full;
bool index_has_first_key;
// For TypedCacheInterface
template <typename TBlocklike>

@ -15,6 +15,7 @@
#include <utility>
#include <vector>
#include "db/db_test_util.h"
#include "db/dbformat.h"
#include "db/memtable.h"
#include "db/write_batch_internal.h"
@ -506,7 +507,7 @@ class IndexBlockTest
void GenerateRandomIndexEntries(std::vector<std::string> *separators,
std::vector<BlockHandle> *block_handles,
std::vector<std::string> *first_keys,
const int len) {
const int len, bool zero_seqno = false) {
Random rnd(42);
// For each of `len` blocks, we need to generate a first and last key.
@ -514,7 +515,11 @@ void GenerateRandomIndexEntries(std::vector<std::string> *separators,
std::set<std::string> keys;
while ((int)keys.size() < len * 2) {
// Keys need to be at least 8 bytes long to look like internal keys.
keys.insert(test::RandomKey(&rnd, 12));
std::string new_key = test::RandomKey(&rnd, 12);
if (zero_seqno) {
AppendInternalKeyFooter(&new_key, 0 /* seqno */, kTypeValue);
}
keys.insert(std::move(new_key));
}
uint64_t offset = 0;
@ -618,6 +623,917 @@ INSTANTIATE_TEST_CASE_P(P, IndexBlockTest,
std::make_tuple(true, false),
std::make_tuple(true, true)));
class BlockPerKVChecksumTest : public DBTestBase {
public:
BlockPerKVChecksumTest()
: DBTestBase("block_per_kv_checksum", /*env_do_fsync=*/false) {}
template <typename TBlockIter>
void TestIterateForward(std::unique_ptr<TBlockIter> &biter,
size_t &verification_count) {
while (biter->Valid()) {
verification_count = 0;
biter->Next();
if (biter->Valid()) {
ASSERT_GE(verification_count, 1);
}
}
}
template <typename TBlockIter>
void TestIterateBackward(std::unique_ptr<TBlockIter> &biter,
size_t &verification_count) {
while (biter->Valid()) {
verification_count = 0;
biter->Prev();
if (biter->Valid()) {
ASSERT_GE(verification_count, 1);
}
}
}
template <typename TBlockIter>
void TestSeekToFirst(std::unique_ptr<TBlockIter> &biter,
size_t &verification_count) {
verification_count = 0;
biter->SeekToFirst();
ASSERT_GE(verification_count, 1);
TestIterateForward(biter, verification_count);
}
template <typename TBlockIter>
void TestSeekToLast(std::unique_ptr<TBlockIter> &biter,
size_t &verification_count) {
verification_count = 0;
biter->SeekToLast();
ASSERT_GE(verification_count, 1);
TestIterateBackward(biter, verification_count);
}
template <typename TBlockIter>
void TestSeekForPrev(std::unique_ptr<TBlockIter> &biter,
size_t &verification_count, std::string k) {
verification_count = 0;
biter->SeekForPrev(k);
ASSERT_GE(verification_count, 1);
TestIterateBackward(biter, verification_count);
}
template <typename TBlockIter>
void TestSeek(std::unique_ptr<TBlockIter> &biter, size_t &verification_count,
std::string k) {
verification_count = 0;
biter->Seek(k);
ASSERT_GE(verification_count, 1);
TestIterateForward(biter, verification_count);
}
bool VerifyChecksum(uint32_t checksum_len, const char *checksum_ptr,
const Slice &key, const Slice &val) {
if (!checksum_len) {
return checksum_ptr == nullptr;
}
return ProtectionInfo64().ProtectKV(key, val).Verify(
static_cast<uint8_t>(checksum_len), checksum_ptr);
}
};
TEST_F(BlockPerKVChecksumTest, EmptyBlock) {
// Tests that empty block code path is not broken by per kv checksum.
BlockBuilder builder(
16 /* block_restart_interval */, true /* use_delta_encoding */,
false /* use_value_delta_encoding */,
BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch);
Slice raw_block = builder.Finish();
BlockContents contents;
contents.data = raw_block;
std::unique_ptr<Block_kData> data_block;
Options options = Options();
BlockBasedTableOptions tbo;
uint8_t protection_bytes_per_key = 8;
BlockCreateContext create_context{
&tbo, nullptr /* statistics */, false /* using_zstd */,
protection_bytes_per_key, options.comparator};
create_context.Create(&data_block, std::move(contents));
std::unique_ptr<DataBlockIter> biter{data_block->NewDataIterator(
options.comparator, kDisableGlobalSequenceNumber)};
biter->SeekToFirst();
ASSERT_FALSE(biter->Valid());
ASSERT_OK(biter->status());
Random rnd(33);
biter->SeekForGet(GenerateInternalKey(1, 1, 10, &rnd));
ASSERT_FALSE(biter->Valid());
ASSERT_OK(biter->status());
biter->SeekToLast();
ASSERT_FALSE(biter->Valid());
ASSERT_OK(biter->status());
biter->Seek(GenerateInternalKey(1, 1, 10, &rnd));
ASSERT_FALSE(biter->Valid());
ASSERT_OK(biter->status());
biter->SeekForPrev(GenerateInternalKey(1, 1, 10, &rnd));
ASSERT_FALSE(biter->Valid());
ASSERT_OK(biter->status());
}
TEST_F(BlockPerKVChecksumTest, UnsupportedOptionValue) {
Options options = Options();
options.block_protection_bytes_per_key = 128;
Destroy(options);
ASSERT_TRUE(TryReopen(options).IsNotSupported());
}
TEST_F(BlockPerKVChecksumTest, InitializeProtectionInfo) {
// Make sure that the checksum construction code path does not break
// when the block is itself already corrupted.
Options options = Options();
BlockBasedTableOptions tbo;
uint8_t protection_bytes_per_key = 8;
BlockCreateContext create_context{
&tbo, nullptr /* statistics */, false /* using_zstd */,
protection_bytes_per_key, options.comparator};
{
std::string invalid_content = "1";
Slice raw_block = invalid_content;
BlockContents contents;
contents.data = raw_block;
std::unique_ptr<Block_kData> data_block;
create_context.Create(&data_block, std::move(contents));
std::unique_ptr<DataBlockIter> iter{data_block->NewDataIterator(
options.comparator, kDisableGlobalSequenceNumber)};
ASSERT_TRUE(iter->status().IsCorruption());
}
{
std::string invalid_content = "1";
Slice raw_block = invalid_content;
BlockContents contents;
contents.data = raw_block;
std::unique_ptr<Block_kIndex> index_block;
create_context.Create(&index_block, std::move(contents));
std::unique_ptr<IndexBlockIter> iter{index_block->NewIndexIterator(
options.comparator, kDisableGlobalSequenceNumber, nullptr, nullptr,
true, false, true, true)};
ASSERT_TRUE(iter->status().IsCorruption());
}
{
std::string invalid_content = "1";
Slice raw_block = invalid_content;
BlockContents contents;
contents.data = raw_block;
std::unique_ptr<Block_kMetaIndex> meta_block;
create_context.Create(&meta_block, std::move(contents));
std::unique_ptr<MetaBlockIter> iter{meta_block->NewMetaIterator(true)};
ASSERT_TRUE(iter->status().IsCorruption());
}
}
TEST_F(BlockPerKVChecksumTest, ApproximateMemory) {
// Tests that ApproximateMemoryUsage() includes memory used by block kv
// checksum.
const int kNumRecords = 20;
std::vector<std::string> keys;
std::vector<std::string> values;
GenerateRandomKVs(&keys, &values, 0, kNumRecords, 1 /* step */,
24 /* padding_size */);
std::unique_ptr<BlockBuilder> builder;
auto generate_block_content = [&]() {
builder = std::make_unique<BlockBuilder>(16 /* restart_interval */);
for (int i = 0; i < kNumRecords; ++i) {
builder->Add(keys[i], values[i]);
}
Slice raw_block = builder->Finish();
BlockContents contents;
contents.data = raw_block;
return contents;
};
Options options = Options();
BlockBasedTableOptions tbo;
uint8_t protection_bytes_per_key = 8;
BlockCreateContext with_checksum_create_context{
&tbo,
nullptr /* statistics */,
false /* using_zstd */,
protection_bytes_per_key,
options.comparator,
true /* index_value_is_full */};
BlockCreateContext create_context{
&tbo, nullptr /* statistics */, false /* using_zstd */,
0, options.comparator, true /* index_value_is_full */};
{
std::unique_ptr<Block_kData> data_block;
create_context.Create(&data_block, generate_block_content());
size_t block_memory = data_block->ApproximateMemoryUsage();
std::unique_ptr<Block_kData> with_checksum_data_block;
with_checksum_create_context.Create(&with_checksum_data_block,
generate_block_content());
ASSERT_GT(with_checksum_data_block->ApproximateMemoryUsage() - block_memory,
100);
}
{
std::unique_ptr<Block_kData> meta_block;
create_context.Create(&meta_block, generate_block_content());
size_t block_memory = meta_block->ApproximateMemoryUsage();
std::unique_ptr<Block_kData> with_checksum_meta_block;
with_checksum_create_context.Create(&with_checksum_meta_block,
generate_block_content());
// Rough comparison to avoid flaky test due to memory allocation alignment.
ASSERT_GT(with_checksum_meta_block->ApproximateMemoryUsage() - block_memory,
100);
}
{
// Index block has different contents.
std::vector<std::string> separators;
std::vector<BlockHandle> block_handles;
std::vector<std::string> first_keys;
GenerateRandomIndexEntries(&separators, &block_handles, &first_keys,
kNumRecords);
auto generate_index_content = [&]() {
builder = std::make_unique<BlockBuilder>(16 /* restart_interval */);
BlockHandle last_encoded_handle;
for (int i = 0; i < kNumRecords; ++i) {
IndexValue entry(block_handles[i], first_keys[i]);
std::string encoded_entry;
std::string delta_encoded_entry;
entry.EncodeTo(&encoded_entry, false, nullptr);
last_encoded_handle = entry.handle;
const Slice delta_encoded_entry_slice(delta_encoded_entry);
builder->Add(separators[i], encoded_entry, &delta_encoded_entry_slice);
}
Slice raw_block = builder->Finish();
BlockContents contents;
contents.data = raw_block;
return contents;
};
std::unique_ptr<Block_kIndex> index_block;
create_context.Create(&index_block, generate_index_content());
size_t block_memory = index_block->ApproximateMemoryUsage();
std::unique_ptr<Block_kIndex> with_checksum_index_block;
with_checksum_create_context.Create(&with_checksum_index_block,
generate_index_content());
ASSERT_GT(
with_checksum_index_block->ApproximateMemoryUsage() - block_memory,
100);
}
}
std::string GetDataBlockIndexTypeStr(
BlockBasedTableOptions::DataBlockIndexType t) {
return t == BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch
? "BinarySearch"
: "BinaryAndHash";
}
class DataBlockKVChecksumTest
: public BlockPerKVChecksumTest,
public testing::WithParamInterface<std::tuple<
BlockBasedTableOptions::DataBlockIndexType,
uint8_t /* block_protection_bytes_per_key */,
uint32_t /* restart_interval*/, bool /* use_delta_encoding */>> {
public:
DataBlockKVChecksumTest() = default;
BlockBasedTableOptions::DataBlockIndexType GetDataBlockIndexType() const {
return std::get<0>(GetParam());
}
uint8_t GetChecksumLen() const { return std::get<1>(GetParam()); }
uint32_t GetRestartInterval() const { return std::get<2>(GetParam()); }
bool GetUseDeltaEncoding() const { return std::get<3>(GetParam()); }
std::unique_ptr<Block_kData> GenerateDataBlock(
std::vector<std::string> &keys, std::vector<std::string> &values,
int num_record) {
BlockBasedTableOptions tbo;
BlockCreateContext create_context{&tbo, nullptr /* statistics */,
false /* using_zstd */, GetChecksumLen(),
Options().comparator};
builder_ = std::make_unique<BlockBuilder>(
static_cast<int>(GetRestartInterval()),
GetUseDeltaEncoding() /* use_delta_encoding */,
false /* use_value_delta_encoding */, GetDataBlockIndexType());
for (int i = 0; i < num_record; i++) {
builder_->Add(keys[i], values[i]);
}
Slice raw_block = builder_->Finish();
BlockContents contents;
contents.data = raw_block;
std::unique_ptr<Block_kData> data_block;
create_context.Create(&data_block, std::move(contents));
return data_block;
}
std::unique_ptr<BlockBuilder> builder_;
};
INSTANTIATE_TEST_CASE_P(
P, DataBlockKVChecksumTest,
::testing::Combine(
::testing::Values(
BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch,
BlockBasedTableOptions::DataBlockIndexType::
kDataBlockBinaryAndHash),
::testing::Values(0, 1, 2, 4, 8) /* protection_bytes_per_key */,
::testing::Values(1, 2, 3, 8, 16) /* restart_interval */,
::testing::Values(false, true)) /* delta_encoding */,
[](const testing::TestParamInfo<std::tuple<
BlockBasedTableOptions::DataBlockIndexType, uint8_t, uint32_t, bool>>
&args) {
std::ostringstream oss;
oss << GetDataBlockIndexTypeStr(std::get<0>(args.param))
<< "ProtectionPerKey" << std::to_string(std::get<1>(args.param))
<< "RestartInterval" << std::to_string(std::get<2>(args.param))
<< "DeltaEncode" << std::to_string(std::get<3>(args.param));
return oss.str();
});
TEST_P(DataBlockKVChecksumTest, ChecksumConstructionAndVerification) {
uint8_t protection_bytes_per_key = GetChecksumLen();
std::vector<int> num_restart_intervals = {1, 16};
for (const auto num_restart_interval : num_restart_intervals) {
const int kNumRecords =
num_restart_interval * static_cast<int>(GetRestartInterval());
std::vector<std::string> keys;
std::vector<std::string> values;
GenerateRandomKVs(&keys, &values, 0, kNumRecords + 1, 1 /* step */,
24 /* padding_size */);
SyncPoint::GetInstance()->DisableProcessing();
std::unique_ptr<Block_kData> data_block =
GenerateDataBlock(keys, values, kNumRecords);
const char *checksum_ptr = data_block->TEST_GetKVChecksum();
// Check checksum of correct length is generated
for (int i = 0; i < kNumRecords; i++) {
ASSERT_TRUE(VerifyChecksum(protection_bytes_per_key,
checksum_ptr + i * protection_bytes_per_key,
keys[i], values[i]));
}
std::vector<SequenceNumber> seqnos{kDisableGlobalSequenceNumber, 0};
// Could just use a boolean flag. Use a counter here just to keep open the
// possibility of checking the exact number of verifications in the future.
size_t verification_count = 0;
// The SyncPoint is placed before checking checksum_len == 0 in
// Block::VerifyChecksum(). So verification count is incremented even with
// protection_bytes_per_key = 0. No actual checksum computation is done in
// that case (see Block::VerifyChecksum()).
SyncPoint::GetInstance()->SetCallBack(
"Block::VerifyChecksum::checksum_len",
[&verification_count, protection_bytes_per_key](void *checksum_len) {
ASSERT_EQ((*static_cast<uint8_t *>(checksum_len)),
protection_bytes_per_key);
++verification_count;
});
SyncPoint::GetInstance()->EnableProcessing();
for (const auto seqno : seqnos) {
std::unique_ptr<DataBlockIter> biter{
data_block->NewDataIterator(Options().comparator, seqno)};
// SeekForGet() some key that does not exist
biter->SeekForGet(keys[kNumRecords]);
TestIterateForward(biter, verification_count);
verification_count = 0;
biter->SeekForGet(keys[kNumRecords / 2]);
ASSERT_GE(verification_count, 1);
TestIterateForward(biter, verification_count);
TestSeekToFirst(biter, verification_count);
TestSeekToLast(biter, verification_count);
TestSeekForPrev(biter, verification_count, keys[kNumRecords / 2]);
TestSeek(biter, verification_count, keys[kNumRecords / 2]);
}
}
}
class IndexBlockKVChecksumTest
: public BlockPerKVChecksumTest,
public testing::WithParamInterface<
std::tuple<BlockBasedTableOptions::DataBlockIndexType, uint8_t,
uint32_t, bool, bool>> {
public:
IndexBlockKVChecksumTest() = default;
BlockBasedTableOptions::DataBlockIndexType GetDataBlockIndexType() const {
return std::get<0>(GetParam());
}
uint8_t GetChecksumLen() const { return std::get<1>(GetParam()); }
uint32_t GetRestartInterval() const { return std::get<2>(GetParam()); }
bool UseValueDeltaEncoding() const { return std::get<3>(GetParam()); }
bool IncludeFirstKey() const { return std::get<4>(GetParam()); }
std::unique_ptr<Block_kIndex> GenerateIndexBlock(
std::vector<std::string> &separators,
std::vector<BlockHandle> &block_handles,
std::vector<std::string> &first_keys, int num_record) {
Options options = Options();
BlockBasedTableOptions tbo;
uint8_t protection_bytes_per_key = GetChecksumLen();
BlockCreateContext create_context{
&tbo,
nullptr /* statistics */,
false /* _using_zstd */,
protection_bytes_per_key,
options.comparator,
!UseValueDeltaEncoding() /* value_is_full */,
IncludeFirstKey()};
builder_ = std::make_unique<BlockBuilder>(
static_cast<int>(GetRestartInterval()), true /* use_delta_encoding */,
UseValueDeltaEncoding() /* use_value_delta_encoding */,
GetDataBlockIndexType());
BlockHandle last_encoded_handle;
for (int i = 0; i < num_record; i++) {
IndexValue entry(block_handles[i], first_keys[i]);
std::string encoded_entry;
std::string delta_encoded_entry;
entry.EncodeTo(&encoded_entry, IncludeFirstKey(), nullptr);
if (UseValueDeltaEncoding() && i > 0) {
entry.EncodeTo(&delta_encoded_entry, IncludeFirstKey(),
&last_encoded_handle);
}
last_encoded_handle = entry.handle;
const Slice delta_encoded_entry_slice(delta_encoded_entry);
builder_->Add(separators[i], encoded_entry, &delta_encoded_entry_slice);
}
// read serialized contents of the block
Slice raw_block = builder_->Finish();
// create block reader
BlockContents contents;
contents.data = raw_block;
std::unique_ptr<Block_kIndex> index_block;
create_context.Create(&index_block, std::move(contents));
return index_block;
}
std::unique_ptr<BlockBuilder> builder_;
};
INSTANTIATE_TEST_CASE_P(
P, IndexBlockKVChecksumTest,
::testing::Combine(
::testing::Values(
BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch,
BlockBasedTableOptions::DataBlockIndexType::
kDataBlockBinaryAndHash),
::testing::Values(0, 1, 2, 4, 8), ::testing::Values(1, 3, 8, 16),
::testing::Values(true, false), ::testing::Values(true, false)),
[](const testing::TestParamInfo<
std::tuple<BlockBasedTableOptions::DataBlockIndexType, uint8_t,
uint32_t, bool, bool>> &args) {
std::ostringstream oss;
oss << GetDataBlockIndexTypeStr(std::get<0>(args.param)) << "ProtBytes"
<< std::to_string(std::get<1>(args.param)) << "RestartInterval"
<< std::to_string(std::get<2>(args.param)) << "ValueDeltaEncode"
<< std::to_string(std::get<3>(args.param)) << "IncludeFirstKey"
<< std::to_string(std::get<4>(args.param));
return oss.str();
});
TEST_P(IndexBlockKVChecksumTest, ChecksumConstructionAndVerification) {
Options options = Options();
uint8_t protection_bytes_per_key = GetChecksumLen();
std::vector<int> num_restart_intervals = {1, 16};
std::vector<SequenceNumber> seqnos{kDisableGlobalSequenceNumber, 10001};
for (const auto num_restart_interval : num_restart_intervals) {
const int kNumRecords =
num_restart_interval * static_cast<int>(GetRestartInterval());
for (const auto seqno : seqnos) {
std::vector<std::string> separators;
std::vector<BlockHandle> block_handles;
std::vector<std::string> first_keys;
GenerateRandomIndexEntries(&separators, &block_handles, &first_keys,
kNumRecords,
seqno != kDisableGlobalSequenceNumber);
SyncPoint::GetInstance()->DisableProcessing();
std::unique_ptr<Block_kIndex> index_block = GenerateIndexBlock(
separators, block_handles, first_keys, kNumRecords);
IndexBlockIter *kNullIter = nullptr;
Statistics *kNullStats = nullptr;
// read contents of block sequentially
std::unique_ptr<IndexBlockIter> biter{index_block->NewIndexIterator(
options.comparator, seqno, kNullIter, kNullStats,
true /* total_order_seek */, IncludeFirstKey() /* have_first_key */,
true /* key_includes_seq */,
!UseValueDeltaEncoding() /* value_is_full */,
true /* block_contents_pinned */, nullptr /* prefix_index */)};
biter->SeekToFirst();
const char *checksum_ptr = index_block->TEST_GetKVChecksum();
// Check checksum of correct length is generated
for (int i = 0; i < kNumRecords; i++) {
// Obtaining the actual content written as value to the index block is not
// trivial: a delta-encoded value is only persisted when the entry is not at
// a block restart point and the keys share some bytes (see
// BlockBuilder::AddWithLastKeyImpl()). So here we just do verification
// using the value from the iterator, unlike the tests for DataBlockIter or
// MetaBlockIter.
ASSERT_TRUE(VerifyChecksum(protection_bytes_per_key,
checksum_ptr + i * protection_bytes_per_key,
biter->key(), biter->raw_value()));
biter->Next();
}
size_t verification_count = 0;
// The SyncPoint is placed before checking checksum_len == 0 in
// Block::VerifyChecksum(). To keep the testing code below simple and avoid
// having to differentiate 0 vs non-0 checksum_len, we do an explicit
// assert checking on checksum_len here.
SyncPoint::GetInstance()->SetCallBack(
"Block::VerifyChecksum::checksum_len",
[&verification_count, protection_bytes_per_key](void *checksum_len) {
ASSERT_EQ((*static_cast<uint8_t *>(checksum_len)),
protection_bytes_per_key);
++verification_count;
});
SyncPoint::GetInstance()->EnableProcessing();
TestSeekToFirst(biter, verification_count);
TestSeekToLast(biter, verification_count);
TestSeek(biter, verification_count, first_keys[kNumRecords / 2]);
}
}
}
class MetaIndexBlockKVChecksumTest
: public BlockPerKVChecksumTest,
public testing::WithParamInterface<
uint8_t /* block_protection_bytes_per_key */> {
public:
MetaIndexBlockKVChecksumTest() = default;
uint8_t GetChecksumLen() const { return GetParam(); }
uint32_t GetRestartInterval() const { return 1; }
std::unique_ptr<Block_kMetaIndex> GenerateMetaIndexBlock(
std::vector<std::string> &keys, std::vector<std::string> &values,
int num_record) {
Options options = Options();
BlockBasedTableOptions tbo;
uint8_t protection_bytes_per_key = GetChecksumLen();
BlockCreateContext create_context{
&tbo, nullptr /* statistics */, false /* using_zstd */,
protection_bytes_per_key, options.comparator};
builder_ =
std::make_unique<BlockBuilder>(static_cast<int>(GetRestartInterval()));
// add a bunch of records to a block
for (int i = 0; i < num_record; i++) {
builder_->Add(keys[i], values[i]);
}
Slice raw_block = builder_->Finish();
BlockContents contents;
contents.data = raw_block;
std::unique_ptr<Block_kMetaIndex> meta_block;
create_context.Create(&meta_block, std::move(contents));
return meta_block;
}
std::unique_ptr<BlockBuilder> builder_;
};
INSTANTIATE_TEST_CASE_P(P, MetaIndexBlockKVChecksumTest,
::testing::Values(0, 1, 2, 4, 8),
[](const testing::TestParamInfo<uint8_t> &args) {
std::ostringstream oss;
oss << "ProtBytes" << std::to_string(args.param);
return oss.str();
});
TEST_P(MetaIndexBlockKVChecksumTest, ChecksumConstructionAndVerification) {
Options options = Options();
BlockBasedTableOptions tbo;
uint8_t protection_bytes_per_key = GetChecksumLen();
BlockCreateContext create_context{
&tbo, nullptr /* statistics */, false /* using_zstd */,
protection_bytes_per_key, options.comparator};
std::vector<int> num_restart_intervals = {1, 16};
for (const auto num_restart_interval : num_restart_intervals) {
const int kNumRecords = num_restart_interval * GetRestartInterval();
std::vector<std::string> keys;
std::vector<std::string> values;
GenerateRandomKVs(&keys, &values, 0, kNumRecords + 1, 1 /* step */,
24 /* padding_size */);
SyncPoint::GetInstance()->DisableProcessing();
std::unique_ptr<Block_kMetaIndex> meta_block =
GenerateMetaIndexBlock(keys, values, kNumRecords);
const char *checksum_ptr = meta_block->TEST_GetKVChecksum();
    // Check that a checksum of the correct length is generated for each entry
for (int i = 0; i < kNumRecords; i++) {
ASSERT_TRUE(VerifyChecksum(protection_bytes_per_key,
checksum_ptr + i * protection_bytes_per_key,
keys[i], values[i]));
}
size_t verification_count = 0;
    // The SyncPoint is placed before the checksum_len == 0 check in
    // Block::VerifyChecksum(). To keep the test code below simple and avoid
    // differentiating zero vs. non-zero checksum_len, we assert on
    // checksum_len explicitly in the callback.
SyncPoint::GetInstance()->SetCallBack(
"Block::VerifyChecksum::checksum_len",
[&verification_count, protection_bytes_per_key](void *checksum_len) {
ASSERT_EQ((*static_cast<uint8_t *>(checksum_len)),
protection_bytes_per_key);
++verification_count;
});
SyncPoint::GetInstance()->EnableProcessing();
    // Check that the block iterator performs checksum verification
std::unique_ptr<MetaBlockIter> biter{
meta_block->NewMetaIterator(true /* block_contents_pinned */)};
TestSeekToFirst(biter, verification_count);
TestSeekToLast(biter, verification_count);
TestSeek(biter, verification_count, keys[kNumRecords / 2]);
TestSeekForPrev(biter, verification_count, keys[kNumRecords / 2]);
}
}
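// Corruption tests below: a SyncPoint callback ("BlockIter::UpdateKey::value")
// flips a byte in the value an iterator is about to expose, so per key-value
// checksum verification should fail and the iterator should become invalid
// with Status::Corruption().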
class DataBlockKVChecksumCorruptionTest : public DataBlockKVChecksumTest {
public:
DataBlockKVChecksumCorruptionTest() = default;
std::unique_ptr<DataBlockIter> GenerateDataBlockIter(
std::vector<std::string> &keys, std::vector<std::string> &values,
int num_record) {
    // During Block construction, a block iter may be created to initialize the
    // per-KV checksums. Disable SyncPoints that may fire in block iter methods
    // during that construction.
SyncPoint::GetInstance()->DisableProcessing();
block_ = GenerateDataBlock(keys, values, num_record);
std::unique_ptr<DataBlockIter> biter{block_->NewDataIterator(
Options().comparator, kDisableGlobalSequenceNumber)};
SyncPoint::GetInstance()->EnableProcessing();
return biter;
}
protected:
std::unique_ptr<Block_kData> block_;
};
TEST_P(DataBlockKVChecksumCorruptionTest, CorruptEntry) {
std::vector<int> num_restart_intervals = {1, 3};
for (const auto num_restart_interval : num_restart_intervals) {
const int kNumRecords =
num_restart_interval * static_cast<int>(GetRestartInterval());
std::vector<std::string> keys;
std::vector<std::string> values;
GenerateRandomKVs(&keys, &values, 0, kNumRecords + 1, 1 /* step */,
24 /* padding_size */);
SyncPoint::GetInstance()->SetCallBack(
"BlockIter::UpdateKey::value", [](void *arg) {
char *value = static_cast<char *>(arg);
// values generated by GenerateRandomKVs are of length 100
++value[10];
});
// Purely for reducing the number of lines of code.
typedef std::unique_ptr<DataBlockIter> IterPtr;
typedef void(IterAPI)(IterPtr & iter, std::string &);
std::string seek_key = keys[kNumRecords / 2];
auto test_seek = [&](IterAPI iter_api) {
IterPtr biter = GenerateDataBlockIter(keys, values, kNumRecords);
ASSERT_OK(biter->status());
iter_api(biter, seek_key);
ASSERT_FALSE(biter->Valid());
ASSERT_TRUE(biter->status().IsCorruption());
};
test_seek([](IterPtr &iter, std::string &) { iter->SeekToFirst(); });
test_seek([](IterPtr &iter, std::string &) { iter->SeekToLast(); });
test_seek([](IterPtr &iter, std::string &k) { iter->Seek(k); });
test_seek([](IterPtr &iter, std::string &k) { iter->SeekForPrev(k); });
test_seek([](IterPtr &iter, std::string &k) { iter->SeekForGet(k); });
typedef void (DataBlockIter::*IterStepAPI)();
auto test_step = [&](IterStepAPI iter_api, std::string &k) {
IterPtr biter = GenerateDataBlockIter(keys, values, kNumRecords);
SyncPoint::GetInstance()->DisableProcessing();
biter->Seek(k);
ASSERT_TRUE(biter->Valid());
ASSERT_OK(biter->status());
SyncPoint::GetInstance()->EnableProcessing();
std::invoke(iter_api, biter);
ASSERT_FALSE(biter->Valid());
ASSERT_TRUE(biter->status().IsCorruption());
};
if (kNumRecords > 1) {
test_step(&DataBlockIter::Prev, seek_key);
test_step(&DataBlockIter::Next, seek_key);
}
}
}
INSTANTIATE_TEST_CASE_P(
P, DataBlockKVChecksumCorruptionTest,
::testing::Combine(
::testing::Values(
BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch,
BlockBasedTableOptions::DataBlockIndexType::
kDataBlockBinaryAndHash),
::testing::Values(4, 8) /* block_protection_bytes_per_key */,
::testing::Values(1, 3, 8, 16) /* restart_interval */,
::testing::Values(false, true)),
[](const testing::TestParamInfo<std::tuple<
BlockBasedTableOptions::DataBlockIndexType, uint8_t, uint32_t, bool>>
&args) {
std::ostringstream oss;
oss << GetDataBlockIndexTypeStr(std::get<0>(args.param)) << "ProtBytes"
<< std::to_string(std::get<1>(args.param)) << "RestartInterval"
<< std::to_string(std::get<2>(args.param)) << "DeltaEncode"
<< std::to_string(std::get<3>(args.param));
return oss.str();
});
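// Index-block variant of the corruption test; also exercises global seqno and
// value delta encoding via the test parameters.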
class IndexBlockKVChecksumCorruptionTest : public IndexBlockKVChecksumTest {
public:
IndexBlockKVChecksumCorruptionTest() = default;
std::unique_ptr<IndexBlockIter> GenerateIndexBlockIter(
std::vector<std::string> &separators,
std::vector<BlockHandle> &block_handles,
std::vector<std::string> &first_keys, int num_record,
SequenceNumber seqno) {
SyncPoint::GetInstance()->DisableProcessing();
block_ =
GenerateIndexBlock(separators, block_handles, first_keys, num_record);
std::unique_ptr<IndexBlockIter> biter{block_->NewIndexIterator(
Options().comparator, seqno, nullptr, nullptr,
true /* total_order_seek */, IncludeFirstKey() /* have_first_key */,
true /* key_includes_seq */,
!UseValueDeltaEncoding() /* value_is_full */,
true /* block_contents_pinned */, nullptr /* prefix_index */)};
SyncPoint::GetInstance()->EnableProcessing();
return biter;
}
protected:
std::unique_ptr<Block_kIndex> block_;
};
INSTANTIATE_TEST_CASE_P(
P, IndexBlockKVChecksumCorruptionTest,
::testing::Combine(
::testing::Values(
BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch,
BlockBasedTableOptions::DataBlockIndexType::
kDataBlockBinaryAndHash),
::testing::Values(4, 8) /* block_protection_bytes_per_key */,
::testing::Values(1, 3, 8, 16) /* restart_interval */,
::testing::Values(true, false), ::testing::Values(true, false)),
[](const testing::TestParamInfo<
std::tuple<BlockBasedTableOptions::DataBlockIndexType, uint8_t,
uint32_t, bool, bool>> &args) {
std::ostringstream oss;
oss << GetDataBlockIndexTypeStr(std::get<0>(args.param)) << "ProtBytes"
<< std::to_string(std::get<1>(args.param)) << "RestartInterval"
<< std::to_string(std::get<2>(args.param)) << "ValueDeltaEncode"
<< std::to_string(std::get<3>(args.param)) << "IncludeFirstKey"
<< std::to_string(std::get<4>(args.param));
return oss.str();
});
TEST_P(IndexBlockKVChecksumCorruptionTest, CorruptEntry) {
std::vector<int> num_restart_intervals = {1, 3};
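  // Cover both the case without a global seqno and the case with one.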
std::vector<SequenceNumber> seqnos{kDisableGlobalSequenceNumber, 10001};
for (const auto num_restart_interval : num_restart_intervals) {
const int kNumRecords =
num_restart_interval * static_cast<int>(GetRestartInterval());
for (const auto seqno : seqnos) {
std::vector<std::string> separators;
std::vector<BlockHandle> block_handles;
std::vector<std::string> first_keys;
GenerateRandomIndexEntries(&separators, &block_handles, &first_keys,
kNumRecords,
seqno != kDisableGlobalSequenceNumber);
SyncPoint::GetInstance()->SetCallBack(
"BlockIter::UpdateKey::value", [](void *arg) {
char *value = static_cast<char *>(arg);
            // The value can be delta-encoded with varying lengths, so we
            // corrupt the first byte here to be safe.
++value[0];
});
typedef std::unique_ptr<IndexBlockIter> IterPtr;
typedef void(IterAPI)(IterPtr & iter, std::string &);
std::string seek_key = first_keys[kNumRecords / 2];
auto test_seek = [&](IterAPI iter_api) {
std::unique_ptr<IndexBlockIter> biter = GenerateIndexBlockIter(
separators, block_handles, first_keys, kNumRecords, seqno);
ASSERT_OK(biter->status());
iter_api(biter, seek_key);
ASSERT_FALSE(biter->Valid());
ASSERT_TRUE(biter->status().IsCorruption());
};
test_seek([](IterPtr &iter, std::string &) { iter->SeekToFirst(); });
test_seek([](IterPtr &iter, std::string &) { iter->SeekToLast(); });
test_seek([](IterPtr &iter, std::string &k) { iter->Seek(k); });
typedef void (IndexBlockIter::*IterStepAPI)();
auto test_step = [&](IterStepAPI iter_api, std::string &k) {
std::unique_ptr<IndexBlockIter> biter = GenerateIndexBlockIter(
separators, block_handles, first_keys, kNumRecords, seqno);
SyncPoint::GetInstance()->DisableProcessing();
biter->Seek(k);
ASSERT_TRUE(biter->Valid());
ASSERT_OK(biter->status());
SyncPoint::GetInstance()->EnableProcessing();
std::invoke(iter_api, biter);
ASSERT_FALSE(biter->Valid());
ASSERT_TRUE(biter->status().IsCorruption());
};
if (kNumRecords > 1) {
test_step(&IndexBlockIter::Prev, seek_key);
test_step(&IndexBlockIter::Next, seek_key);
}
}
}
}
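// Metaindex-block variant of the corruption test.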
class MetaIndexBlockKVChecksumCorruptionTest
: public MetaIndexBlockKVChecksumTest {
public:
MetaIndexBlockKVChecksumCorruptionTest() = default;
std::unique_ptr<MetaBlockIter> GenerateMetaIndexBlockIter(
std::vector<std::string> &keys, std::vector<std::string> &values,
int num_record) {
SyncPoint::GetInstance()->DisableProcessing();
block_ = GenerateMetaIndexBlock(keys, values, num_record);
std::unique_ptr<MetaBlockIter> biter{
block_->NewMetaIterator(true /* block_contents_pinned */)};
SyncPoint::GetInstance()->EnableProcessing();
return biter;
}
protected:
std::unique_ptr<Block_kMetaIndex> block_;
};
INSTANTIATE_TEST_CASE_P(
P, MetaIndexBlockKVChecksumCorruptionTest,
::testing::Values(4, 8) /* block_protection_bytes_per_key */,
[](const testing::TestParamInfo<uint8_t> &args) {
std::ostringstream oss;
oss << "ProtBytes" << std::to_string(args.param);
return oss.str();
});
TEST_P(MetaIndexBlockKVChecksumCorruptionTest, CorruptEntry) {
Options options = Options();
std::vector<int> num_restart_intervals = {1, 3};
for (const auto num_restart_interval : num_restart_intervals) {
const int kNumRecords =
num_restart_interval * static_cast<int>(GetRestartInterval());
std::vector<std::string> keys;
std::vector<std::string> values;
GenerateRandomKVs(&keys, &values, 0, kNumRecords + 1, 1 /* step */,
24 /* padding_size */);
SyncPoint::GetInstance()->SetCallBack(
"BlockIter::UpdateKey::value", [](void *arg) {
char *value = static_cast<char *>(arg);
// values generated by GenerateRandomKVs are of length 100
++value[10];
});
typedef std::unique_ptr<MetaBlockIter> IterPtr;
typedef void(IterAPI)(IterPtr & iter, std::string &);
typedef void (MetaBlockIter::*IterStepAPI)();
std::string seek_key = keys[kNumRecords / 2];
auto test_seek = [&](IterAPI iter_api) {
IterPtr biter = GenerateMetaIndexBlockIter(keys, values, kNumRecords);
ASSERT_OK(biter->status());
iter_api(biter, seek_key);
ASSERT_FALSE(biter->Valid());
ASSERT_TRUE(biter->status().IsCorruption());
};
test_seek([](IterPtr &iter, std::string &) { iter->SeekToFirst(); });
test_seek([](IterPtr &iter, std::string &) { iter->SeekToLast(); });
test_seek([](IterPtr &iter, std::string &k) { iter->Seek(k); });
test_seek([](IterPtr &iter, std::string &k) { iter->SeekForPrev(k); });
auto test_step = [&](IterStepAPI iter_api, const std::string &k) {
IterPtr biter = GenerateMetaIndexBlockIter(keys, values, kNumRecords);
SyncPoint::GetInstance()->DisableProcessing();
biter->Seek(k);
ASSERT_TRUE(biter->Valid());
ASSERT_OK(biter->status());
SyncPoint::GetInstance()->EnableProcessing();
std::invoke(iter_api, biter);
ASSERT_FALSE(biter->Valid());
ASSERT_TRUE(biter->status().IsCorruption());
};
if (kNumRecords > 1) {
test_step(&MetaBlockIter::Prev, seek_key);
test_step(&MetaBlockIter::Next, seek_key);
}
}
}
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char **argv) {
  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

@@ -581,8 +581,9 @@ void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2,
   const bool kImmortal = true;
   ASSERT_OK(ioptions.table_factory->NewTableReader(
       TableReaderOptions(ioptions, moptions.prefix_extractor, soptions,
-                         internal_comparator, !kSkipFilters, !kImmortal,
-                         level_),
+                         internal_comparator,
+                         0 /* block_protection_bytes_per_key */, !kSkipFilters,
+                         !kImmortal, level_),
       std::move(file_reader), sink->contents().size(), &table_reader));
   // Search using Get()
   ReadOptions ro;

@@ -266,9 +266,9 @@ class BlockFetcherTest : public testing::Test {
     const auto* table_options =
         table_factory_.GetOptions<BlockBasedTableOptions>();
     ASSERT_NE(table_options, nullptr);
-    ASSERT_OK(BlockBasedTable::Open(ro, ioptions, EnvOptions(), *table_options,
-                                    comparator, std::move(file), file_size,
-                                    &table_reader));
+    ASSERT_OK(BlockBasedTable::Open(
+        ro, ioptions, EnvOptions(), *table_options, comparator,
+        std::move(file), file_size, 0 /* block_protection_bytes_per_key */,
+        &table_reader));
     table->reset(reinterpret_cast<BlockBasedTable*>(table_reader.release()));
   }

@@ -165,10 +165,10 @@ Status SstFileDumper::NewTableReader(
     const ImmutableOptions& /*ioptions*/, const EnvOptions& /*soptions*/,
     const InternalKeyComparator& /*internal_comparator*/, uint64_t file_size,
     std::unique_ptr<TableReader>* /*table_reader*/) {
-  auto t_opt =
-      TableReaderOptions(ioptions_, moptions_.prefix_extractor, soptions_,
-                         internal_comparator_, false /* skip_filters */,
-                         false /* imortal */, true /* force_direct_prefetch */);
+  auto t_opt = TableReaderOptions(
+      ioptions_, moptions_.prefix_extractor, soptions_, internal_comparator_,
+      0 /* block_protection_bytes_per_key */, false /* skip_filters */,
+      false /* immortal */, true /* force_direct_prefetch */);
   // Allow open file with global sequence number for backward compatibility.
   t_opt.largest_seqno = kMaxSequenceNumber;

@@ -56,7 +56,8 @@ Status SstFileReader::Open(const std::string& file_path) {
   }
   if (s.ok()) {
     TableReaderOptions t_opt(r->ioptions, r->moptions.prefix_extractor,
-                             r->soptions, r->ioptions.internal_comparator);
+                             r->soptions, r->ioptions.internal_comparator,
+                             r->moptions.block_protection_bytes_per_key);
     // Allow open file with global sequence number for backward compatibility.
     s = r->options.table_factory->NewTableReader(t_opt, std::move(file_reader),
s = r->options.table_factory->NewTableReader(t_opt, std::move(file_reader),

@@ -37,9 +37,9 @@ struct TableReaderOptions {
       const std::shared_ptr<const SliceTransform>& _prefix_extractor,
       const EnvOptions& _env_options,
       const InternalKeyComparator& _internal_comparator,
-      bool _skip_filters = false, bool _immortal = false,
-      bool _force_direct_prefetch = false, int _level = -1,
-      BlockCacheTracer* const _block_cache_tracer = nullptr,
+      uint8_t _block_protection_bytes_per_key, bool _skip_filters = false,
+      bool _immortal = false, bool _force_direct_prefetch = false,
+      int _level = -1, BlockCacheTracer* const _block_cache_tracer = nullptr,
       size_t _max_file_size_for_l0_meta_pin = 0,
       const std::string& _cur_db_session_id = "", uint64_t _cur_file_num = 0,
       UniqueId64x2 _unique_id = {}, SequenceNumber _largest_seqno = 0)
@@ -56,7 +56,8 @@ struct TableReaderOptions {
         max_file_size_for_l0_meta_pin(_max_file_size_for_l0_meta_pin),
         cur_db_session_id(_cur_db_session_id),
         cur_file_num(_cur_file_num),
-        unique_id(_unique_id) {}
+        unique_id(_unique_id),
+        block_protection_bytes_per_key(_block_protection_bytes_per_key) {}

   const ImmutableOptions& ioptions;
   const std::shared_ptr<const SliceTransform>& prefix_extractor;
@@ -86,6 +87,8 @@ struct TableReaderOptions {
   // Known unique_id or {}, kNullUniqueId64x2 means unknown
   UniqueId64x2 unique_id;
+
+  uint8_t block_protection_bytes_per_key;
 };

 struct TableBuilderOptions {

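Note: `block_protection_bytes_per_key` is now a required constructor argument placed right after the internal comparator, ahead of all defaulted parameters, so every `TableReaderOptions` construction site has to pass it explicitly. A minimal sketch of an updated call site (the identifiers stand in for whatever the caller has in scope, as in the hunks above):

```
// Sketch only: ioptions, moptions, soptions, and internal_comparator are
// placeholders for the caller's existing state.
TableReaderOptions t_opt(ioptions, moptions.prefix_extractor, soptions,
                         internal_comparator,
                         moptions.block_protection_bytes_per_key);
// The optional arguments (skip_filters, immortal, ...) keep their defaults.
```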
@@ -144,7 +144,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
         new RandomAccessFileReader(std::move(raf), file_name));
     s = opts.table_factory->NewTableReader(
         TableReaderOptions(ioptions, moptions.prefix_extractor, env_options,
-                           ikc),
+                           ikc, 0 /* block_protection_bytes_per_key */),
         std::move(file_reader), file_size, &table_reader);
     if (!s.ok()) {
       fprintf(stderr, "Open Table Error: %s\n", s.ToString().c_str());

@@ -444,7 +444,9 @@ class TableConstructor : public Constructor {
     file_reader_.reset(new RandomAccessFileReader(std::move(source), "test"));
     return ioptions.table_factory->NewTableReader(
         TableReaderOptions(ioptions, moptions.prefix_extractor, soptions,
-                           *last_internal_comparator_, /*skip_filters*/ false,
+                           *last_internal_comparator_,
+                           0 /* block_protection_bytes_per_key */,
+                           /*skip_filters*/ false,
                            /*immortal*/ false, false, level_,
                            &block_cache_tracer_, moptions.write_buffer_size, "",
                            file_num_, kNullUniqueId64x2, largest_seqno_),
@@ -4795,7 +4797,7 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) {
   options.table_factory->NewTableReader(
       TableReaderOptions(ioptions, moptions.prefix_extractor, EnvOptions(),
-                         ikc),
+                         ikc, 0 /* block_protection_bytes_per_key */),
       std::move(file_reader), ss_rw.contents().size(), &table_reader);

   return table_reader->NewIterator(
@@ -4964,7 +4966,8 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) {
   ASSERT_OK(ioptions.table_factory->NewTableReader(
       TableReaderOptions(ioptions2, moptions2.prefix_extractor, EnvOptions(),
-                         GetPlainInternalComparator(options2.comparator)),
+                         GetPlainInternalComparator(options2.comparator),
+                         0 /* block_protection_bytes_per_key */),
       std::move(file_reader), sink->contents().size(), &table_reader));
   ReadOptions read_options;

@@ -1725,6 +1725,10 @@ DEFINE_uint32(
     "This option determines the size of such checksums. "
    "Supported values: 0, 1, 2, 4, 8.");

+DEFINE_uint32(block_protection_bytes_per_key, 0,
+              "Enable block per key-value checksum protection. "
+              "Supported values: 0, 1, 2, 4, 8.");
+
 DEFINE_bool(build_info, false,
             "Print the build info via GetRocksBuildInfoAsString");
@@ -4565,6 +4569,8 @@ class Benchmark {
     }
     options.memtable_protection_bytes_per_key =
         FLAGS_memtable_protection_bytes_per_key;
+    options.block_protection_bytes_per_key =
+        FLAGS_block_protection_bytes_per_key;
   }

   void InitializeOptionsGeneral(Options* opts) {

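Outside of db_bench, the feature is switched on through the column family option of the same name; a minimal sketch using only the public options API:

```
#include "rocksdb/options.h"

ROCKSDB_NAMESPACE::Options options;
// 0 (the default) disables block per key-value checksums; 1, 2, 4, or 8
// select the checksum size in bytes, trading memory and CPU for protection.
options.block_protection_bytes_per_key = 1;
```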
@@ -37,6 +37,7 @@ default_params = {
     "backup_one_in": 100000,
     "batch_protection_bytes_per_key": lambda: random.choice([0, 8]),
     "memtable_protection_bytes_per_key": lambda: random.choice([0, 1, 2, 4, 8]),
+    "block_protection_bytes_per_key": lambda: random.choice([0, 1, 2, 4, 8]),
     "block_size": 16384,
     "bloom_bits": lambda: random.choice(
         [random.randint(0, 19), random.lognormvariate(2.3, 1.3)]
