From 920386f2b7ef0c5b22c55f63ad8edf24b499120f Mon Sep 17 00:00:00 2001 From: Hui Xiao Date: Tue, 1 Feb 2022 17:41:20 -0800 Subject: [PATCH] Detect (new) Bloom/Ribbon Filter construction corruption (#9342) Summary: Note: rebase on and merge after https://github.com/facebook/rocksdb/pull/9349, https://github.com/facebook/rocksdb/pull/9345, (optional) https://github.com/facebook/rocksdb/pull/9393 **Context:** (Quoted from pdillinger) Layers of information during new Bloom/Ribbon Filter construction in building block-based tables includes the following: a) set of keys to add to filter b) set of hashes to add to filter (64-bit hash applied to each key) c) set of Bloom indices to set in filter, with duplicates d) set of Bloom indices to set in filter, deduplicated e) final filter and its checksum This PR aims to detect corruption (e.g, unexpected hardware/software corruption on data structures residing in the memory for a long time) from b) to e) and leave a) as future works for application level. - b)'s corruption is detected by verifying the xor checksum of the hash entries calculated as the entries accumulate before being added to the filter. (i.e, `XXPH3FilterBitsBuilder::MaybeVerifyHashEntriesChecksum()`) - c) - e)'s corruption is detected by verifying the hash entries indeed exists in the constructed filter by re-querying these hash entries in the filter (i.e, `FilterBitsBuilder::MaybePostVerify()`) after computing the block checksum (except for PartitionFilter, which is done right after each `FilterBitsBuilder::Finish` for impl simplicity - see code comment for more). For this stage of detection, we assume hash entries are not corrupted after checking on b) since the time interval from b) to c) is relatively short IMO. Option to enable this feature of detection is `BlockBasedTableOptions::detect_filter_construct_corruption` which is false by default. **Summary:** - Implemented new functions `XXPH3FilterBitsBuilder::MaybeVerifyHashEntriesChecksum()` and `FilterBitsBuilder::MaybePostVerify()` - Ensured hash entries, final filter and banding and their [cache reservation ](https://github.com/facebook/rocksdb/issues/9073) are released properly despite corruption - See [Filter.construction.artifacts.release.point.pdf ](https://github.com/facebook/rocksdb/files/7923487/Design.Filter.construction.artifacts.release.point.pdf) for high-level design - Bundled and refactored hash entries's related artifact in XXPH3FilterBitsBuilder into `HashEntriesInfo` for better control on lifetime of these artifact during `SwapEntires`, `ResetEntries` - Ensured RocksDB block-based table builder calls `FilterBitsBuilder::MaybePostVerify()` after constructing the filter by `FilterBitsBuilder::Finish()` - When encountering such filter construction corruption, stop writing the filter content to files and mark such a block-based table building non-ok by storing the corruption status in the builder. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9342 Test Plan: - Added new unit test `DBFilterConstructionCorruptionTestWithParam.DetectCorruption` - Included this new feature in `DBFilterConstructionReserveMemoryTestWithParam.ReserveMemory` as this feature heavily touch ReserveMemory's impl - For fallback case, I run `./filter_bench -impl=3 -detect_filter_construct_corruption=true -reserve_table_builder_memory=true -strict_capacity_limit=true -quick -runs 10 | grep 'Build avg'` to make sure nothing break. - Added to `filter_bench`: increased filter construction time by **30%**, mostly by `MaybePostVerify()` - FastLocalBloom - Before change: `./filter_bench -impl=2 -quick -runs 10 | grep 'Build avg'`: **28.86643s** - After change: - `./filter_bench -impl=2 -detect_filter_construct_corruption=false -quick -runs 10 | grep 'Build avg'` (expect a tiny increase due to MaybePostVerify is always called regardless): **27.6644s (-4% perf improvement might be due to now we don't drop bloom hash entry in `AddAllEntries` along iteration but in bulk later, same with the bypassing-MaybePostVerify case below)** - `./filter_bench -impl=2 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'` (expect acceptable increase): **34.41159s (+20%)** - `./filter_bench -impl=2 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'` (by-passing MaybePostVerify, expect minor increase): **27.13431s (-6%)** - Standard128Ribbon - Before change: `./filter_bench -impl=3 -quick -runs 10 | grep 'Build avg'`: **122.5384s** - After change: - `./filter_bench -impl=3 -detect_filter_construct_corruption=false -quick -runs 10 | grep 'Build avg'` (expect a tiny increase due to MaybePostVerify is always called regardless - verified by removing MaybePostVerify under this case and found only +-1ns difference): **124.3588s (+2%)** - `./filter_bench -impl=3 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'`(expect acceptable increase): **159.4946s (+30%)** - `./filter_bench -impl=3 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'`(by-passing MaybePostVerify, expect minor increase) : **125.258s (+2%)** - Added to `db_stress`: `make crash_test`, `./db_stress --detect_filter_construct_corruption=true` - Manually smoke-tested: manually corrupted the filter construction in some db level tests with basic PUT and background flush. As expected, the error did get returned to users in subsequent PUT and Flush status. Reviewed By: pdillinger Differential Revision: D33746928 Pulled By: hx235 fbshipit-source-id: cb056426be5a7debc1cd16f23bc250f36a08ca57 --- HISTORY.md | 3 + db/db_bloom_filter_test.cc | 312 +++++++++++++-- db_stress_tool/db_stress_common.h | 1 + db_stress_tool/db_stress_gflags.cc | 6 + db_stress_tool/db_stress_test_base.cc | 2 + include/rocksdb/filter_policy.h | 31 ++ include/rocksdb/table.h | 16 +- options/options_settable_test.cc | 3 +- options/options_test.cc | 4 +- .../block_based/block_based_table_builder.cc | 17 +- .../block_based/block_based_table_factory.cc | 5 + table/block_based/filter_block.h | 12 +- table/block_based/filter_policy.cc | 363 ++++++++++++++---- table/block_based/filter_policy_internal.h | 15 +- table/block_based/full_filter_block.cc | 4 +- table/block_based/full_filter_block.h | 4 + table/block_based/full_filter_block_test.cc | 6 + table/block_based/partitioned_filter_block.cc | 23 +- table/block_based/partitioned_filter_block.h | 25 +- tools/db_crashtest.py | 1 + util/filter_bench.cc | 20 +- 21 files changed, 740 insertions(+), 133 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index cde4bc73c..cd5ffe43d 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -26,6 +26,9 @@ * Disallow the combination of DBOptions.use_direct_io_for_flush_and_compaction == true and DBOptions.writable_file_max_buffer_size == 0. This combination can cause WritableFileWriter::Append() to loop forever, and it does not make much sense in direct IO. * `ReadOptions::total_order_seek` no longer affects `DB::Get()`. The original motivation for this interaction has been obsolete since RocksDB has been able to detect whether the current prefix extractor is compatible with that used to generate table files, probably RocksDB 5.14.0. +## New Features +* Introduced an option `BlockBasedTableBuilder::detect_filter_construct_corruption` for detecting corruption during Bloom Filter (format_version >= 5) and Ribbon Filter construction. + ## 6.29.0 (01/21/2022) Note: The next release will be major release 7.0. See https://github.com/facebook/rocksdb/issues/9390 for more info. ### Public API change diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index 2ae4ff3bd..db621912d 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -7,6 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include #include #include @@ -17,6 +18,8 @@ #include "port/stack_trace.h" #include "rocksdb/convenience.h" #include "rocksdb/perf_context.h" +#include "rocksdb/table.h" +#include "table/block_based/block_based_table_reader.h" #include "table/block_based/filter_policy_internal.h" #include "test_util/testutil.h" #include "util/string_util.h" @@ -776,7 +779,7 @@ using FilterConstructionReserveMemoryHash = uint64_t; class DBFilterConstructionReserveMemoryTestWithParam : public DBTestBase, public testing::WithParamInterface< - std::tuple> { + std::tuple> { public: DBFilterConstructionReserveMemoryTestWithParam() : DBTestBase("db_bloom_filter_tests", @@ -784,7 +787,8 @@ class DBFilterConstructionReserveMemoryTestWithParam num_key_(0), reserve_table_builder_memory_(std::get<0>(GetParam())), policy_(std::get<1>(GetParam())), - partition_filters_(std::get<2>(GetParam())) { + partition_filters_(std::get<2>(GetParam())), + detect_filter_construct_corruption_(std::get<3>(GetParam())) { if (!reserve_table_builder_memory_ || policy_ == BloomFilterPolicy::Mode::kDeprecatedBlock || policy_ == BloomFilterPolicy::Mode::kLegacyBloom) { @@ -839,6 +843,8 @@ class DBFilterConstructionReserveMemoryTestWithParam // entries and final filter. table_options.metadata_block_size = 409000; } + table_options.detect_filter_construct_corruption = + detect_filter_construct_corruption_; LRUCacheOptions lo; lo.capacity = kCacheCapacity; @@ -870,20 +876,37 @@ class DBFilterConstructionReserveMemoryTestWithParam BloomFilterPolicy::Mode policy_; bool partition_filters_; std::shared_ptr cache_; + bool detect_filter_construct_corruption_; }; INSTANTIATE_TEST_CASE_P( BlockBasedTableOptions, DBFilterConstructionReserveMemoryTestWithParam, ::testing::Values( - std::make_tuple(false, BloomFilterPolicy::Mode::kFastLocalBloom, false), - std::make_tuple(true, BloomFilterPolicy::Mode::kFastLocalBloom, false), - std::make_tuple(true, BloomFilterPolicy::Mode::kFastLocalBloom, true), - std::make_tuple(true, BloomFilterPolicy::Mode::kStandard128Ribbon, + std::make_tuple(false, BloomFilterPolicy::Mode::kFastLocalBloom, false, false), + + std::make_tuple(true, BloomFilterPolicy::Mode::kFastLocalBloom, false, + false), + std::make_tuple(true, BloomFilterPolicy::Mode::kFastLocalBloom, false, + true), + std::make_tuple(true, BloomFilterPolicy::Mode::kFastLocalBloom, true, + false), + std::make_tuple(true, BloomFilterPolicy::Mode::kFastLocalBloom, true, + true), + std::make_tuple(true, BloomFilterPolicy::Mode::kStandard128Ribbon, + false, false), + std::make_tuple(true, BloomFilterPolicy::Mode::kStandard128Ribbon, + false, true), + std::make_tuple(true, BloomFilterPolicy::Mode::kStandard128Ribbon, true, + false), + std::make_tuple(true, BloomFilterPolicy::Mode::kStandard128Ribbon, true, true), - std::make_tuple(true, BloomFilterPolicy::Mode::kDeprecatedBlock, false), - std::make_tuple(true, BloomFilterPolicy::Mode::kLegacyBloom, false))); + + std::make_tuple(true, BloomFilterPolicy::Mode::kDeprecatedBlock, false, + false), + std::make_tuple(true, BloomFilterPolicy::Mode::kLegacyBloom, false, + false))); // TODO: Speed up this test. // The current test inserts many keys (on the scale of dummy entry size) @@ -919,8 +942,8 @@ TEST_P(DBFilterConstructionReserveMemoryTestWithParam, ReserveMemory) { // filter construction cache reservation, flush won't be triggered before we // manually trigger it for clean testing options.write_buffer_size = 640 << 20; - options.table_factory.reset( - NewBlockBasedTableFactory(GetBlockBasedTableOptions())); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); std::shared_ptr cache = GetFilterConstructResPeakTrackingCache(); options.create_if_missing = true; @@ -943,6 +966,8 @@ TEST_P(DBFilterConstructionReserveMemoryTestWithParam, ReserveMemory) { bool reserve_table_builder_memory = ReserveTableBuilderMemory(); BloomFilterPolicy::Mode policy = GetFilterPolicy(); bool partition_filters = PartitionFilters(); + bool detect_filter_construct_corruption = + table_options.detect_filter_construct_corruption; std::deque filter_construction_cache_res_peaks = cache->GetReservedCachePeaks(); @@ -999,6 +1024,12 @@ TEST_P(DBFilterConstructionReserveMemoryTestWithParam, ReserveMemory) { * multiple of dummy entries so that reservation for (p0 - b) * will trigger at least another dummy entry insertion. * + * BloomFilterPolicy::Mode::kFastLocalBloom + FullFilter + + * detect_filter_construct_corruption + * The peak p0 stays the same as + * (BloomFilterPolicy::Mode::kFastLocalBloom + FullFilter) but just lasts + * longer since we release hash entries reservation later. + * * BloomFilterPolicy::Mode::kFastLocalBloom + PartitionedFilter * p1 * / \ @@ -1016,6 +1047,12 @@ TEST_P(DBFilterConstructionReserveMemoryTestWithParam, ReserveMemory) { * + parittioned final filter1 + parittioned final filter2 * = hash entries + final filter * + * BloomFilterPolicy::Mode::kFastLocalBloom + PartitionedFilter + + * detect_filter_construct_corruption + * The peak p0, p1 stay the same as + * (BloomFilterPolicy::Mode::kFastLocalBloom + PartitionedFilter) but just + * last longer since we release hash entries reservation later. + * */ if (!partition_filters) { EXPECT_EQ(filter_construction_cache_res_peaks.size(), 1) @@ -1067,7 +1104,28 @@ TEST_P(DBFilterConstructionReserveMemoryTestWithParam, ReserveMemory) { * will trigger at least another dummy entry insertion * (or equivelantly to saying, creating another peak). * - * BloomFilterPolicy::Mode::kStandard128Ribbon + PartitionedFilter + * BloomFilterPolicy::Mode::kStandard128Ribbon + FullFilter + + * detect_filter_construct_corruption + * + * new p0 + * / \ + * / \ + * pre p0 \ + * / \ + * / \ + * b / \ + * / \ + * 0/ \ + * hash entries = b - 0, banding = pre p0 - b, + * final filter = new p0 - pre p0 + * new p0 = hash entries + banding + final filter + * + * The previous p0 will no longer be a peak since under + * detect_filter_construct_corruption == true, we do not release hash + * entries reserveration (like p0 - b' previously) until after final filter + * creation and post-verification + * + * BloomFilterPolicy::Mode::kStandard128Ribbon + PartitionedFilter * p3 * p0 /\ p4 * / \ p1 / \ /\ @@ -1085,6 +1143,38 @@ TEST_P(DBFilterConstructionReserveMemoryTestWithParam, ReserveMemory) { * + parittioned banding1 + parittioned banding2 * + parittioned final filter1 + parittioned final filter2 * = hash entries + banding + final filter + * + * BloomFilterPolicy::Mode::kStandard128Ribbon + PartitionedFilter + + * detect_filter_construct_corruption + * + * new p3 + * / \ + * pre p3 \ + * new p0 / \ + * / \ / \ + * pre p0 \ / \ + * / \ b'/ \ + * / \ / \ + * b / \ / \ + * / \a \ + * 0/ \ + * partitioned hash entries1 = b - 0, partitioned hash entries2 = b' - a + * partitioned banding1 = pre p0 - b, partitioned banding2 = pre p3 - b' + * parittioned final filter1 = new p0 - pre p0, + * parittioned final filter2 = new p3 - pre p3 + * + * The previous p0 and p3 will no longer be a peak since under + * detect_filter_construct_corruption == true, we do not release hash + * entries reserveration (like p0 - b', p3 - a' previously) until after + * parittioned final filter creation and post-verification + * + * However, increments sum stay the same as shown below: + * (increment new p0 - 0) + (increment new p3 - a) + * = partitioned hash entries1 + partitioned hash entries2 + * + parittioned banding1 + parittioned banding2 + * + parittioned final filter1 + parittioned final filter2 + * = hash entries + banding + final filter + * */ if (!partition_filters) { ASSERT_GE(std::floor(1.0 * predicted_final_filter_cache_res / @@ -1092,28 +1182,59 @@ TEST_P(DBFilterConstructionReserveMemoryTestWithParam, ReserveMemory) { 1) << "Final filter cache reservation too small for this test - please " "increase the number of keys"; - EXPECT_EQ(filter_construction_cache_res_peaks.size(), 2) - << "Filter construction cache reservation should have 2 peaks in " - "case: BloomFilterPolicy::Mode::kStandard128Ribbon + FullFilter. " - "The second peak is resulted from charging the final filter after " - "decreasing the hash entry reservation since the testing final " - "filter reservation is designed to be at least 1 dummy entry size"; - - std::size_t filter_construction_cache_res_peak = - filter_construction_cache_res_peaks[0]; - std::size_t predicted_filter_construction_cache_res_peak = - predicted_hash_entries_cache_res + predicted_banding_cache_res; - EXPECT_GE(filter_construction_cache_res_peak, - predicted_filter_construction_cache_res_peak * 0.9); - EXPECT_LE(filter_construction_cache_res_peak, - predicted_filter_construction_cache_res_peak * 1.1); + if (!detect_filter_construct_corruption) { + EXPECT_EQ(filter_construction_cache_res_peaks.size(), 2) + << "Filter construction cache reservation should have 2 peaks in " + "case: BloomFilterPolicy::Mode::kStandard128Ribbon + " + "FullFilter. " + "The second peak is resulted from charging the final filter " + "after " + "decreasing the hash entry reservation since the testing final " + "filter reservation is designed to be at least 1 dummy entry " + "size"; + + std::size_t filter_construction_cache_res_peak = + filter_construction_cache_res_peaks[0]; + std::size_t predicted_filter_construction_cache_res_peak = + predicted_hash_entries_cache_res + predicted_banding_cache_res; + EXPECT_GE(filter_construction_cache_res_peak, + predicted_filter_construction_cache_res_peak * 0.9); + EXPECT_LE(filter_construction_cache_res_peak, + predicted_filter_construction_cache_res_peak * 1.1); + } else { + EXPECT_EQ(filter_construction_cache_res_peaks.size(), 1) + << "Filter construction cache reservation should have 1 peaks in " + "case: BloomFilterPolicy::Mode::kStandard128Ribbon + FullFilter " + "+ detect_filter_construct_corruption. " + "The previous second peak now disappears since we don't " + "decrease the hash entry reservation" + "until after final filter reservation and post-verification"; + + std::size_t filter_construction_cache_res_peak = + filter_construction_cache_res_peaks[0]; + std::size_t predicted_filter_construction_cache_res_peak = + predicted_hash_entries_cache_res + predicted_banding_cache_res + + predicted_final_filter_cache_res; + EXPECT_GE(filter_construction_cache_res_peak, + predicted_filter_construction_cache_res_peak * 0.9); + EXPECT_LE(filter_construction_cache_res_peak, + predicted_filter_construction_cache_res_peak * 1.1); + } return; } else { - EXPECT_GE(filter_construction_cache_res_peaks.size(), 3) - << "Filter construction cache reservation should have more than 3 " - "peaks " - "in case: BloomFilterPolicy::Mode::kStandard128Ribbon + " - "PartitionedFilter"; + if (!detect_filter_construct_corruption) { + EXPECT_GE(filter_construction_cache_res_peaks.size(), 3) + << "Filter construction cache reservation should have more than 3 " + "peaks " + "in case: BloomFilterPolicy::Mode::kStandard128Ribbon + " + "PartitionedFilter"; + } else { + EXPECT_GE(filter_construction_cache_res_peaks.size(), 2) + << "Filter construction cache reservation should have more than 2 " + "peaks " + "in case: BloomFilterPolicy::Mode::kStandard128Ribbon + " + "PartitionedFilter + detect_filter_construct_corruption"; + } std::size_t predicted_filter_construction_cache_res_increments_sum = predicted_hash_entries_cache_res + predicted_banding_cache_res + predicted_final_filter_cache_res; @@ -1126,6 +1247,135 @@ TEST_P(DBFilterConstructionReserveMemoryTestWithParam, ReserveMemory) { } } +class DBFilterConstructionCorruptionTestWithParam + : public DBTestBase, + public testing::WithParamInterface< + std::tuple> { + public: + DBFilterConstructionCorruptionTestWithParam() + : DBTestBase("db_bloom_filter_tests", + /*env_do_fsync=*/true) {} + + BlockBasedTableOptions GetBlockBasedTableOptions() { + BlockBasedTableOptions table_options; + table_options.detect_filter_construct_corruption = std::get<0>(GetParam()); + table_options.filter_policy.reset( + new BloomFilterPolicy(10, std::get<1>(GetParam()))); + table_options.partition_filters = std::get<2>(GetParam()); + if (table_options.partition_filters) { + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + // We set table_options.metadata_block_size small enough so we can + // trigger filter partitioning with GetNumKey() amount of keys + table_options.metadata_block_size = 10; + } + + return table_options; + } + + // Return an appropriate amount of keys for testing + // to generate a long filter (i.e, size >= 8 + kMetadataLen) + std::size_t GetNumKey() { return 5000; } +}; + +INSTANTIATE_TEST_CASE_P( + DBFilterConstructionCorruptionTestWithParam, + DBFilterConstructionCorruptionTestWithParam, + ::testing::Values( + std::make_tuple(false, BloomFilterPolicy::Mode::kFastLocalBloom, false), + std::make_tuple(true, BloomFilterPolicy::Mode::kFastLocalBloom, false), + std::make_tuple(true, BloomFilterPolicy::Mode::kFastLocalBloom, true), + std::make_tuple(true, BloomFilterPolicy::Mode::kStandard128Ribbon, + false), + std::make_tuple(true, BloomFilterPolicy::Mode::kStandard128Ribbon, + true))); + +TEST_P(DBFilterConstructionCorruptionTestWithParam, DetectCorruption) { + Options options = CurrentOptions(); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.create_if_missing = true; + options.disable_auto_compactions = true; + + DestroyAndReopen(options); + int num_key = static_cast(GetNumKey()); + Status s; + + // Case 1: No corruption in filter construction + for (int i = 0; i < num_key; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + } + s = Flush(); + EXPECT_TRUE(s.ok()); + + // Case 2: Corruption of hash entries in filter construction + for (int i = 0; i < num_key; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + } + + SyncPoint::GetInstance()->SetCallBack( + "XXPH3FilterBitsBuilder::Finish::TamperHashEntries", [&](void* arg) { + std::deque* hash_entries_to_corrupt = + (std::deque*)arg; + assert(!hash_entries_to_corrupt->empty()); + *(hash_entries_to_corrupt->begin()) = + *(hash_entries_to_corrupt->begin()) ^ uint64_t { 1 }; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + s = Flush(); + + if (table_options.detect_filter_construct_corruption) { + EXPECT_TRUE(s.IsCorruption()); + EXPECT_TRUE( + s.ToString().find("Filter's hash entries checksum mismatched") != + std::string::npos); + } else { + EXPECT_TRUE(s.ok()); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearCallBack( + "XXPH3FilterBitsBuilder::Finish::" + "TamperHashEntries"); + + // Case 3: Corruption of filter content in filter construction + DestroyAndReopen(options); + + for (int i = 0; i < num_key; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + } + + SyncPoint::GetInstance()->SetCallBack( + "XXPH3FilterBitsBuilder::Finish::TamperFilter", [&](void* arg) { + std::pair*, std::size_t>* TEST_arg_pair = + (std::pair*, std::size_t>*)arg; + std::size_t filter_size = TEST_arg_pair->second; + // 5 is the kMetadataLen and + assert(filter_size >= 8 + 5); + std::unique_ptr* filter_content_to_corrupt = + TEST_arg_pair->first; + std::memset(filter_content_to_corrupt->get(), '\0', 8); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + s = Flush(); + + if (table_options.detect_filter_construct_corruption) { + EXPECT_TRUE(s.IsCorruption()); + EXPECT_TRUE(s.ToString().find("Corrupted filter content") != + std::string::npos); + } else { + EXPECT_TRUE(s.ok()); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearCallBack( + "XXPH3FilterBitsBuilder::Finish::" + "TamperFilter"); +} + namespace { // A wrapped bloom over block-based FilterPolicy class TestingWrappedBlockBasedFilterPolicy : public FilterPolicy { diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index a3b6e80eb..22e3c9e43 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -149,6 +149,7 @@ DECLARE_bool(use_block_based_filter); DECLARE_int32(ribbon_starting_level); DECLARE_bool(partition_filters); DECLARE_bool(optimize_filters_for_memory); +DECLARE_bool(detect_filter_construct_corruption); DECLARE_int32(index_type); DECLARE_string(db); DECLARE_string(secondaries_base); diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index 53e816960..8d9a04427 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -453,6 +453,12 @@ DEFINE_bool( ROCKSDB_NAMESPACE::BlockBasedTableOptions().optimize_filters_for_memory, "Minimize memory footprint of filters"); +DEFINE_bool( + detect_filter_construct_corruption, + ROCKSDB_NAMESPACE::BlockBasedTableOptions() + .detect_filter_construct_corruption, + "Detect corruption during new Bloom Filter and Ribbon Filter construction"); + DEFINE_int32( index_type, static_cast( diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 84922ccb2..6f532c888 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -2272,6 +2272,8 @@ void StressTest::Open() { block_based_options.partition_filters = FLAGS_partition_filters; block_based_options.optimize_filters_for_memory = FLAGS_optimize_filters_for_memory; + block_based_options.detect_filter_construct_corruption = + FLAGS_detect_filter_construct_corruption; block_based_options.index_type = static_cast(FLAGS_index_type); block_based_options.prepopulate_block_cache = diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h index 01a29f27a..38f97ede4 100644 --- a/include/rocksdb/filter_policy.h +++ b/include/rocksdb/filter_policy.h @@ -65,6 +65,37 @@ class FilterBitsBuilder { // The ownership of actual data is set to buf virtual Slice Finish(std::unique_ptr* buf) = 0; + // Similar to Finish(std::unique_ptr* buf), except that + // for a non-null status pointer argument, it will point to + // Status::Corruption() when there is any corruption during filter + // construction or Status::OK() otherwise. + // + // WARNING: do not use a filter resulted from a corrupted construction + virtual Slice Finish(std::unique_ptr* buf, + Status* /* status */) { + return Finish(buf); + } + + // Verify the filter returned from calling FilterBitsBuilder::Finish. + // The function returns Status::Corruption() if there is any corruption in the + // constructed filter or Status::OK() otherwise. + // + // Implementations should normally consult + // FilterBuildingContext::table_options.detect_filter_construct_corruption + // to determine whether to perform verification or to skip by returning + // Status::OK(). The decision is left to the FilterBitsBuilder so that + // verification prerequisites before PostVerify can be skipped when not + // configured. + // + // RocksDB internal will always call MaybePostVerify() on the filter after + // it is returned from calling FilterBitsBuilder::Finish + // except for FilterBitsBuilder::Finish resulting a corruption + // status, which indicates the filter is already in a corrupted state and + // there is no need to post-verify + virtual Status MaybePostVerify(const Slice& /* filter_content */) { + return Status::OK(); + } + // Approximate the number of keys that can be added and generate a filter // <= the specified number of bytes. Callers (including RocksDB) should // only use this result for optimizing performance and not as a guarantee. diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 1e48eca72..e2daa2343 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -293,16 +293,16 @@ struct BlockBasedTableOptions { // the memory, if block cache available. // // Charged memory usage includes: - // 1. (new) Bloom Filter and Ribbon Filter construction + // 1. Bloom Filter (format_version >= 5) and Ribbon Filter construction // 2. More to come... // // Note: - // 1. (new) Bloom Filter and Ribbon Filter construction + // 1. Bloom Filter (format_version >= 5) and Ribbon Filter construction // // If additional temporary memory of Ribbon Filter uses up too much memory // relative to the avaible space left in the block cache // at some point (i.e, causing a cache full when strict_capacity_limit = - // true), construction will fall back to (new) Bloom Filter. + // true), construction will fall back to Bloom Filter. // // Default: false bool reserve_table_builder_memory = false; @@ -365,6 +365,16 @@ struct BlockBasedTableOptions { // This must generally be true for gets to be efficient. bool whole_key_filtering = true; + // If true, detect corruption during Bloom Filter (format_version >= 5) + // and Ribbon Filter construction. + // + // This is an extra check that is only + // useful in detecting software bugs or CPU+memory malfunction. + // Turning on this feature increases filter construction time by 30%. + // + // TODO: optimize this performance + bool detect_filter_construct_corruption = false; + // Verify that decompressing the compressed block gives back the input. This // is a verification mode that we use to detect bugs in compression // algorithms. diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 64930dcee..00899018e 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -174,7 +174,8 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) { "partition_filters=false;" "optimize_filters_for_memory=true;" "index_block_restart_interval=4;" - "filter_policy=bloomfilter:4:true;whole_key_filtering=1;" + "filter_policy=bloomfilter:4:true;whole_key_filtering=1;detect_filter_" + "construct_corruption=false;" "reserve_table_builder_memory=false;" "format_version=1;" "hash_index_allow_collision=false;" diff --git a/options/options_test.cc b/options/options_test.cc index ad2cca7e9..99cd7019f 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -856,7 +856,8 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { "block_size_deviation=8;block_restart_interval=4;" "format_version=5;whole_key_filtering=1;" "reserve_table_builder_memory=true;" - "filter_policy=bloomfilter:4.567:false;" + "filter_policy=bloomfilter:4.567:false;detect_filter_construct_" + "corruption=true;" // A bug caused read_amp_bytes_per_bit to be a large integer in OPTIONS // file generated by 6.10 to 6.14. Though bug is fixed in these releases, // we need to handle the case of loading OPTIONS file generated before the @@ -876,6 +877,7 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(new_opt.block_restart_interval, 4); ASSERT_EQ(new_opt.format_version, 5U); ASSERT_EQ(new_opt.whole_key_filtering, true); + ASSERT_EQ(new_opt.detect_filter_construct_corruption, true); ASSERT_EQ(new_opt.reserve_table_builder_memory, true); ASSERT_TRUE(new_opt.filter_policy != nullptr); const BloomFilterPolicy* bfp = diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index 8c2d41f69..b4a4cfca6 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -1252,6 +1252,15 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, uint32_t checksum = ComputeBuiltinChecksumWithLastByte( r->table_options.checksum, block_contents.data(), block_contents.size(), /*last_byte*/ type); + + if (block_type == BlockType::kFilter) { + Status s = r->filter_builder->MaybePostVerifyFilter(block_contents); + if (!s.ok()) { + r->SetStatus(s); + return; + } + } + EncodeFixed32(trailer.data() + 1, checksum); TEST_SYNC_POINT_CALLBACK( "BlockBasedTableBuilder::WriteRawBlock:TamperWithChecksum", @@ -1552,7 +1561,13 @@ void BlockBasedTableBuilder::WriteFilterBlock( std::unique_ptr filter_data; Slice filter_content = rep_->filter_builder->Finish(filter_block_handle, &s, &filter_data); - assert(s.ok() || s.IsIncomplete()); + + assert(s.ok() || s.IsIncomplete() || s.IsCorruption()); + if (s.IsCorruption()) { + rep_->SetStatus(s); + break; + } + rep_->props.filter_size += filter_content.size(); // TODO: Refactor code so that BlockType can determine both the C++ type diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index 5b1bd2e68..d4b75f2ec 100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -356,6 +356,11 @@ static std::unordered_map {offsetof(struct BlockBasedTableOptions, whole_key_filtering), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"detect_filter_construct_corruption", + {offsetof(struct BlockBasedTableOptions, + detect_filter_construct_corruption), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, {"reserve_table_builder_memory", {offsetof(struct BlockBasedTableOptions, reserve_table_builder_memory), OptionType::kBoolean, OptionVerificationType::kNormal, diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h index 765eb0244..b2c82fc2c 100644 --- a/table/block_based/filter_block.h +++ b/table/block_based/filter_block.h @@ -83,9 +83,17 @@ class FilterBlockBuilder { , Status* status, std::unique_ptr* filter_data = nullptr) = 0; - // It is for releasing the memory usage and cache reservation of filter bits - // builder in FullFilter and PartitionedFilter + // This is called when finishes using the FilterBitsBuilder + // in order to release memory usage and cache reservation + // associated with it timely virtual void ResetFilterBitsBuilder() {} + + // To optionally post-verify the filter returned from + // FilterBlockBuilder::Finish. + // Return Status::OK() if skipped. + virtual Status MaybePostVerifyFilter(const Slice& /* filter_content */) { + return Status::OK(); + } }; // A FilterBlockReader is used to parse filter from SST table. diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc index 4e49fbbdd..f916ab4b4 100644 --- a/table/block_based/filter_policy.cc +++ b/table/block_based/filter_policy.cc @@ -10,6 +10,7 @@ #include "rocksdb/filter_policy.h" #include +#include #include #include #include @@ -47,15 +48,22 @@ Slice FinishAlwaysFalse(std::unique_ptr* /*buf*/) { return Slice(nullptr, 0); } +Slice FinishAlwaysTrue(std::unique_ptr* /*buf*/) { + return Slice("\0\0\0\0\0\0", 6); +} + // Base class for filter builders using the XXH3 preview hash, // also known as Hash64 or GetSliceHash64. class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { public: explicit XXPH3FilterBitsBuilder( std::atomic* aggregate_rounding_balance, - std::shared_ptr cache_res_mgr) + std::shared_ptr cache_res_mgr, + bool detect_filter_construct_corruption) : aggregate_rounding_balance_(aggregate_rounding_balance), - cache_res_mgr_(cache_res_mgr) {} + cache_res_mgr_(cache_res_mgr), + detect_filter_construct_corruption_( + detect_filter_construct_corruption) {} ~XXPH3FilterBitsBuilder() override {} @@ -65,27 +73,34 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { // though only adjacent repetition, which we want to immediately // recognize and collapse for estimating true filter space // requirements. - if (hash_entries_.empty() || hash != hash_entries_.back()) { - hash_entries_.push_back(hash); + if (hash_entries_info_.entries.empty() || + hash != hash_entries_info_.entries.back()) { + if (detect_filter_construct_corruption_) { + hash_entries_info_.xor_checksum ^= hash; + } + hash_entries_info_.entries.push_back(hash); if (cache_res_mgr_ && // Traditional rounding to whole bucket size - ((hash_entries_.size() % kUint64tHashEntryCacheResBucketSize) == + ((hash_entries_info_.entries.size() % + kUint64tHashEntryCacheResBucketSize) == kUint64tHashEntryCacheResBucketSize / 2)) { - hash_entry_cache_res_bucket_handles_.emplace_back(nullptr); + hash_entries_info_.cache_res_bucket_handles.emplace_back(nullptr); Status s = cache_res_mgr_ ->MakeCacheReservation( kUint64tHashEntryCacheResBucketSize * sizeof(hash), - &hash_entry_cache_res_bucket_handles_.back()); + &hash_entries_info_.cache_res_bucket_handles.back()); s.PermitUncheckedError(); } } } virtual size_t EstimateEntriesAdded() override { - return hash_entries_.size(); + return hash_entries_info_.entries.size(); } + virtual Status MaybePostVerify(const Slice& filter_content) override; + protected: static constexpr uint32_t kMetadataLen = 5; @@ -96,13 +111,12 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { // For delegating between XXPH3FilterBitsBuilders void SwapEntriesWith(XXPH3FilterBitsBuilder* other) { - std::swap(hash_entries_, other->hash_entries_); - if (cache_res_mgr_) { - std::swap(hash_entry_cache_res_bucket_handles_, - other->hash_entry_cache_res_bucket_handles_); - } + assert(other != nullptr); + hash_entries_info_.Swap(&(other->hash_entries_info_)); } + void ResetEntries() { hash_entries_info_.Reset(); } + virtual size_t RoundDownUsableSpace(size_t available_size) = 0; // To choose size using malloc_usable_size, we have to actually allocate. @@ -203,9 +217,32 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { return rv; } - // A deque avoids unnecessary copying of already-saved values - // and has near-minimal peak memory use. - std::deque hash_entries_; + // TODO: Ideally we want to verify the hash entry + // as it is added to the filter and eliminate this function + // for speeding up and leaving fewer spaces for undetected memory/CPU + // corruption. For Ribbon Filter, it's bit harder. + // Possible solution: + // pass a custom iterator that tracks the xor checksum as + // it iterates to ResetAndFindSeedToSolve + Status MaybeVerifyHashEntriesChecksum() { + if (!detect_filter_construct_corruption_) { + return Status::OK(); + } + + uint64_t actual_hash_entries_xor_checksum = 0; + for (uint64_t h : hash_entries_info_.entries) { + actual_hash_entries_xor_checksum ^= h; + } + + if (actual_hash_entries_xor_checksum == hash_entries_info_.xor_checksum) { + return Status::OK(); + } else { + // Since these hash entries are corrupted and they will not be used + // anymore, we can reset them and release memory. + ResetEntries(); + return Status::Corruption("Filter's hash entries checksum mismatched"); + } + } // See BloomFilterPolicy::aggregate_rounding_balance_. If nullptr, // always "round up" like historic behavior. @@ -214,17 +251,47 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { // For reserving memory used in (new) Bloom and Ribbon Filter construction std::shared_ptr cache_res_mgr_; - // For managing cache reservation for buckets of hash entry in (new) Bloom and - // Ribbon Filter construction - std::deque>> - hash_entry_cache_res_bucket_handles_; - // For managing cache reservation for final filter in (new) Bloom and Ribbon // Filter construction std::deque>> final_filter_cache_res_handles_; + + bool detect_filter_construct_corruption_; + + struct HashEntriesInfo { + // A deque avoids unnecessary copying of already-saved values + // and has near-minimal peak memory use. + std::deque entries; + + // If cache_res_mgr_ != nullptr, + // it manages cache reservation for buckets of hash entries in (new) Bloom + // or Ribbon Filter construction. + // Otherwise, it is empty. + std::deque>> + cache_res_bucket_handles; + + // If detect_filter_construct_corruption_ == true, + // it records the xor checksum of hash entries. + // Otherwise, it is 0. + uint64_t xor_checksum = 0; + + void Swap(HashEntriesInfo* other) { + assert(other != nullptr); + std::swap(entries, other->entries); + std::swap(cache_res_bucket_handles, other->cache_res_bucket_handles); + std::swap(xor_checksum, other->xor_checksum); + } + + void Reset() { + entries.clear(); + cache_res_bucket_handles.clear(); + xor_checksum = 0; + } + }; + + HashEntriesInfo hash_entries_info_; }; // #################### FastLocalBloom implementation ################## // @@ -237,8 +304,10 @@ class FastLocalBloomBitsBuilder : public XXPH3FilterBitsBuilder { explicit FastLocalBloomBitsBuilder( const int millibits_per_key, std::atomic* aggregate_rounding_balance, - std::shared_ptr cache_res_mgr) - : XXPH3FilterBitsBuilder(aggregate_rounding_balance, cache_res_mgr), + std::shared_ptr cache_res_mgr, + bool detect_filter_construct_corruption) + : XXPH3FilterBitsBuilder(aggregate_rounding_balance, cache_res_mgr, + detect_filter_construct_corruption), millibits_per_key_(millibits_per_key) { assert(millibits_per_key >= 1000); } @@ -249,25 +318,29 @@ class FastLocalBloomBitsBuilder : public XXPH3FilterBitsBuilder { ~FastLocalBloomBitsBuilder() override {} + using FilterBitsBuilder::Finish; + virtual Slice Finish(std::unique_ptr* buf) override { - size_t num_entries = hash_entries_.size(); + return Finish(buf, nullptr); + } + + virtual Slice Finish(std::unique_ptr* buf, + Status* status) override { + size_t num_entries = hash_entries_info_.entries.size(); size_t len_with_metadata = CalculateSpace(num_entries); std::unique_ptr mutable_buf; + std::unique_ptr> + final_filter_cache_res_handle; len_with_metadata = AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf); // Cache reservation for mutable_buf if (cache_res_mgr_) { - std::unique_ptr< - CacheReservationHandle> - final_filter_cache_res_handle; Status s = cache_res_mgr_ ->MakeCacheReservation( len_with_metadata * sizeof(char), &final_filter_cache_res_handle); - final_filter_cache_res_handles_.push_back( - std::move(final_filter_cache_res_handle)); s.PermitUncheckedError(); } @@ -282,12 +355,25 @@ class FastLocalBloomBitsBuilder : public XXPH3FilterBitsBuilder { uint32_t len = static_cast(len_with_metadata - kMetadataLen); if (len > 0) { + TEST_SYNC_POINT_CALLBACK( + "XXPH3FilterBitsBuilder::Finish::" + "TamperHashEntries", + &hash_entries_info_.entries); AddAllEntries(mutable_buf.get(), len, num_probes); + Status verify_hash_entries_checksum_status = + MaybeVerifyHashEntriesChecksum(); + if (!verify_hash_entries_checksum_status.ok()) { + if (status) { + *status = verify_hash_entries_checksum_status; + } + return FinishAlwaysTrue(buf); + } } - assert(hash_entries_.empty()); - // Release cache for hash entries - hash_entry_cache_res_bucket_handles_.clear(); + bool keep_entries_for_postverify = detect_filter_construct_corruption_; + if (!keep_entries_for_postverify) { + ResetEntries(); + } // See BloomFilterPolicy::GetBloomBitsReader re: metadata // -1 = Marker for newer Bloom implementations @@ -298,8 +384,18 @@ class FastLocalBloomBitsBuilder : public XXPH3FilterBitsBuilder { mutable_buf[len + 2] = static_cast(num_probes); // rest of metadata stays zero + auto TEST_arg_pair __attribute__((__unused__)) = + std::make_pair(&mutable_buf, len_with_metadata); + TEST_SYNC_POINT_CALLBACK("XXPH3FilterBitsBuilder::Finish::TamperFilter", + &TEST_arg_pair); + Slice rv(mutable_buf.get(), len_with_metadata); *buf = std::move(mutable_buf); + final_filter_cache_res_handles_.push_back( + std::move(final_filter_cache_res_handle)); + if (status) { + *status = Status::OK(); + } return rv; } @@ -366,12 +462,12 @@ class FastLocalBloomBitsBuilder : public XXPH3FilterBitsBuilder { void AddAllEntries(char* data, uint32_t len, int num_probes) { // Simple version without prefetching: // - // for (auto h : hash_entries_) { + // for (auto h : hash_entries_info_.entries) { // FastLocalBloomImpl::AddHash(Lower32of64(h), Upper32of64(h), len, // num_probes, data); // } - const size_t num_entries = hash_entries_.size(); + const size_t num_entries = hash_entries_info_.entries.size(); constexpr size_t kBufferMask = 7; static_assert(((kBufferMask + 1) & kBufferMask) == 0, "Must be power of 2 minus 1"); @@ -381,12 +477,14 @@ class FastLocalBloomBitsBuilder : public XXPH3FilterBitsBuilder { // Prime the buffer size_t i = 0; + std::deque::iterator hash_entries_it = + hash_entries_info_.entries.begin(); for (; i <= kBufferMask && i < num_entries; ++i) { - uint64_t h = hash_entries_.front(); - hash_entries_.pop_front(); + uint64_t h = *hash_entries_it; FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data, /*out*/ &byte_offsets[i]); hashes[i] = Upper32of64(h); + ++hash_entries_it; } // Process and buffer @@ -397,11 +495,11 @@ class FastLocalBloomBitsBuilder : public XXPH3FilterBitsBuilder { FastLocalBloomImpl::AddHashPrepared(hash_ref, num_probes, data + byte_offset_ref); // And buffer - uint64_t h = hash_entries_.front(); - hash_entries_.pop_front(); + uint64_t h = *hash_entries_it; FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data, /*out*/ &byte_offset_ref); hash_ref = Upper32of64(h); + ++hash_entries_it; } // Finish processing @@ -416,7 +514,7 @@ class FastLocalBloomBitsBuilder : public XXPH3FilterBitsBuilder { }; // See description in FastLocalBloomImpl -class FastLocalBloomBitsReader : public FilterBitsReader { +class FastLocalBloomBitsReader : public BuiltinFilterBitsReader { public: FastLocalBloomBitsReader(const char* data, int num_probes, uint32_t len_bytes) : data_(data), num_probes_(num_probes), len_bytes_(len_bytes) {} @@ -451,6 +549,11 @@ class FastLocalBloomBitsReader : public FilterBitsReader { } } + bool HashMayMatch(const uint64_t h) override { + return FastLocalBloomImpl::HashMayMatch(Lower32of64(h), Upper32of64(h), + len_bytes_, num_probes_, data_); + } + private: const char* data_; const int num_probes_; @@ -486,12 +589,14 @@ class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder { explicit Standard128RibbonBitsBuilder( double desired_one_in_fp_rate, int bloom_millibits_per_key, std::atomic* aggregate_rounding_balance, - std::shared_ptr cache_res_mgr, Logger* info_log) - : XXPH3FilterBitsBuilder(aggregate_rounding_balance, cache_res_mgr), + std::shared_ptr cache_res_mgr, + bool detect_filter_construct_corruption, Logger* info_log) + : XXPH3FilterBitsBuilder(aggregate_rounding_balance, cache_res_mgr, + detect_filter_construct_corruption), desired_one_in_fp_rate_(desired_one_in_fp_rate), info_log_(info_log), bloom_fallback_(bloom_millibits_per_key, aggregate_rounding_balance, - cache_res_mgr) { + cache_res_mgr, detect_filter_construct_corruption) { assert(desired_one_in_fp_rate >= 1.0); } @@ -501,20 +606,32 @@ class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder { ~Standard128RibbonBitsBuilder() override {} + using FilterBitsBuilder::Finish; + virtual Slice Finish(std::unique_ptr* buf) override { - if (hash_entries_.size() > kMaxRibbonEntries) { - ROCKS_LOG_WARN(info_log_, "Too many keys for Ribbon filter: %llu", - static_cast(hash_entries_.size())); + return Finish(buf, nullptr); + } + + virtual Slice Finish(std::unique_ptr* buf, + Status* status) override { + if (hash_entries_info_.entries.size() > kMaxRibbonEntries) { + ROCKS_LOG_WARN( + info_log_, "Too many keys for Ribbon filter: %llu", + static_cast(hash_entries_info_.entries.size())); SwapEntriesWith(&bloom_fallback_); - assert(hash_entries_.empty()); - return bloom_fallback_.Finish(buf); + assert(hash_entries_info_.entries.empty()); + return bloom_fallback_.Finish(buf, status); } - if (hash_entries_.size() == 0) { + if (hash_entries_info_.entries.size() == 0) { // Save a conditional in Ribbon queries by using alternate reader // for zero entries added. + if (status) { + *status = Status::OK(); + } return FinishAlwaysFalse(buf); } - uint32_t num_entries = static_cast(hash_entries_.size()); + uint32_t num_entries = + static_cast(hash_entries_info_.entries.size()); uint32_t num_slots; size_t len_with_metadata; @@ -523,13 +640,13 @@ class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder { // Bloom fall-back indicator if (num_slots == 0) { SwapEntriesWith(&bloom_fallback_); - assert(hash_entries_.empty()); - return bloom_fallback_.Finish(buf); + assert(hash_entries_info_.entries.empty()); + return bloom_fallback_.Finish(buf, status); } uint32_t entropy = 0; - if (!hash_entries_.empty()) { - entropy = Lower32of64(hash_entries_.front()); + if (!hash_entries_info_.entries.empty()) { + entropy = Lower32of64(hash_entries_info_.entries.front()); } BandingType banding; @@ -552,46 +669,62 @@ class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder { "Cache reservation for Ribbon filter banding failed due " "to cache full"); SwapEntriesWith(&bloom_fallback_); - assert(hash_entries_.empty()); + assert(hash_entries_info_.entries.empty()); // Release cache for banding since the banding won't be allocated banding_res_handle.reset(); - return bloom_fallback_.Finish(buf); + return bloom_fallback_.Finish(buf, status); } + TEST_SYNC_POINT_CALLBACK( + "XXPH3FilterBitsBuilder::Finish::" + "TamperHashEntries", + &hash_entries_info_.entries); + bool success = banding.ResetAndFindSeedToSolve( - num_slots, hash_entries_.begin(), hash_entries_.end(), + num_slots, hash_entries_info_.entries.begin(), + hash_entries_info_.entries.end(), /*starting seed*/ entropy & 255, /*seed mask*/ 255); if (!success) { - ROCKS_LOG_WARN(info_log_, - "Too many re-seeds (256) for Ribbon filter, %llu / %llu", - static_cast(hash_entries_.size()), - static_cast(num_slots)); + ROCKS_LOG_WARN( + info_log_, "Too many re-seeds (256) for Ribbon filter, %llu / %llu", + static_cast(hash_entries_info_.entries.size()), + static_cast(num_slots)); SwapEntriesWith(&bloom_fallback_); - assert(hash_entries_.empty()); - return bloom_fallback_.Finish(buf); + assert(hash_entries_info_.entries.empty()); + return bloom_fallback_.Finish(buf, status); + } + + Status verify_hash_entries_checksum_status = + MaybeVerifyHashEntriesChecksum(); + if (!verify_hash_entries_checksum_status.ok()) { + ROCKS_LOG_WARN(info_log_, "Verify hash entries checksum error: %s", + verify_hash_entries_checksum_status.getState()); + if (status) { + *status = verify_hash_entries_checksum_status; + } + return FinishAlwaysTrue(buf); + } + + bool keep_entries_for_postverify = detect_filter_construct_corruption_; + if (!keep_entries_for_postverify) { + ResetEntries(); } - hash_entries_.clear(); - // Release cache for hash entries - hash_entry_cache_res_bucket_handles_.clear(); uint32_t seed = banding.GetOrdinalSeed(); assert(seed < 256); std::unique_ptr mutable_buf; + std::unique_ptr> + final_filter_cache_res_handle; len_with_metadata = AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf); // Cache reservation for mutable_buf if (cache_res_mgr_) { - std::unique_ptr< - CacheReservationHandle> - final_filter_cache_res_handle; Status s = cache_res_mgr_ ->MakeCacheReservation( len_with_metadata * sizeof(char), &final_filter_cache_res_handle); - final_filter_cache_res_handles_.push_back( - std::move(final_filter_cache_res_handle)); s.PermitUncheckedError(); } @@ -619,8 +752,18 @@ class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder { mutable_buf[len_with_metadata - 1] = static_cast((num_blocks >> 16) & 255); + auto TEST_arg_pair __attribute__((__unused__)) = + std::make_pair(&mutable_buf, len_with_metadata); + TEST_SYNC_POINT_CALLBACK("XXPH3FilterBitsBuilder::Finish::TamperFilter", + &TEST_arg_pair); + Slice rv(mutable_buf.get(), len_with_metadata); *buf = std::move(mutable_buf); + final_filter_cache_res_handles_.push_back( + std::move(final_filter_cache_res_handle)); + if (status) { + *status = Status::OK(); + } return rv; } @@ -637,8 +780,8 @@ class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder { return; } uint32_t entropy = 0; - if (!hash_entries_.empty()) { - entropy = Upper32of64(hash_entries_.front()); + if (!hash_entries_info_.entries.empty()) { + entropy = Upper32of64(hash_entries_info_.entries.front()); } *num_slots = NumEntriesToNumSlots(static_cast(num_entries)); @@ -764,6 +907,12 @@ class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder { return fake_soln.ExpectedFpRate(); } + Status MaybePostVerify(const Slice& filter_content) override { + bool fall_back = (bloom_fallback_.EstimateEntriesAdded() > 0); + return fall_back ? bloom_fallback_.MaybePostVerify(filter_content) + : XXPH3FilterBitsBuilder::MaybePostVerify(filter_content); + } + protected: size_t RoundDownUsableSpace(size_t available_size) override { size_t rv = available_size - kMetadataLen; @@ -808,7 +957,7 @@ class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder { // for the linker, at least with DEBUG_LEVEL=2 constexpr uint32_t Standard128RibbonBitsBuilder::kMaxRibbonEntries; -class Standard128RibbonBitsReader : public FilterBitsReader { +class Standard128RibbonBitsReader : public BuiltinFilterBitsReader { public: Standard128RibbonBitsReader(const char* data, size_t len_bytes, uint32_t num_blocks, uint32_t seed) @@ -848,6 +997,10 @@ class Standard128RibbonBitsReader : public FilterBitsReader { } } + bool HashMayMatch(const uint64_t h) override { + return soln_.FilterQuery(h, hasher_); + } + private: using TS = Standard128RibbonTypesAndSettings; ribbon::SerializableInterleavedSolution soln_; @@ -874,6 +1027,8 @@ class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder { return hash_entries_.size(); } + using FilterBitsBuilder::Finish; + Slice Finish(std::unique_ptr* buf) override; size_t CalculateSpace(size_t num_entries) override { @@ -1055,7 +1210,7 @@ inline void LegacyBloomBitsBuilder::AddHash(uint32_t h, char* data, folly::constexpr_log2(CACHE_LINE_SIZE)); } -class LegacyBloomBitsReader : public FilterBitsReader { +class LegacyBloomBitsReader : public BuiltinFilterBitsReader { public: LegacyBloomBitsReader(const char* data, int num_probes, uint32_t num_lines, uint32_t log2_cache_line_size) @@ -1100,6 +1255,8 @@ class LegacyBloomBitsReader : public FilterBitsReader { } } + bool HashMayMatch(const uint64_t /* h */) override { return false; } + private: const char* data_; const int num_probes_; @@ -1107,18 +1264,47 @@ class LegacyBloomBitsReader : public FilterBitsReader { const uint32_t log2_cache_line_size_; }; -class AlwaysTrueFilter : public FilterBitsReader { +class AlwaysTrueFilter : public BuiltinFilterBitsReader { public: bool MayMatch(const Slice&) override { return true; } using FilterBitsReader::MayMatch; // inherit overload + bool HashMayMatch(const uint64_t) override { return true; } + using BuiltinFilterBitsReader::HashMayMatch; // inherit overload }; -class AlwaysFalseFilter : public FilterBitsReader { +class AlwaysFalseFilter : public BuiltinFilterBitsReader { public: bool MayMatch(const Slice&) override { return false; } using FilterBitsReader::MayMatch; // inherit overload + bool HashMayMatch(const uint64_t) override { return false; } + using BuiltinFilterBitsReader::HashMayMatch; // inherit overload }; +Status XXPH3FilterBitsBuilder::MaybePostVerify(const Slice& filter_content) { + Status s = Status::OK(); + + if (!detect_filter_construct_corruption_) { + return s; + } + + std::unique_ptr bits_reader( + BuiltinFilterPolicy::GetBuiltinFilterBitsReader(filter_content)); + + for (uint64_t h : hash_entries_info_.entries) { + // The current approach will not detect corruption from XXPH3Filter to + // AlwaysTrueFilter, which can lead to performance cost later due to + // AlwaysTrueFilter not filtering anything. But this cost is acceptable + // given the extra implementation complixity to detect such case. + bool may_match = bits_reader->HashMayMatch(h); + if (!may_match) { + s = Status::Corruption("Corrupted filter content"); + break; + } + } + + ResetEntries(); + return s; +} } // namespace const std::vector BloomFilterPolicy::kAllFixedImpls = { @@ -1268,7 +1454,8 @@ FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext( case kFastLocalBloom: return new FastLocalBloomBitsBuilder( millibits_per_key_, offm ? &aggregate_rounding_balance_ : nullptr, - cache_res_mgr); + cache_res_mgr, + context.table_options.detect_filter_construct_corruption); case kLegacyBloom: if (whole_bits_per_key_ >= 14 && context.info_log && !warned_.load(std::memory_order_relaxed)) { @@ -1294,6 +1481,7 @@ FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext( return new Standard128RibbonBitsBuilder( desired_one_in_fp_rate_, millibits_per_key_, offm ? &aggregate_rounding_balance_ : nullptr, cache_res_mgr, + context.table_options.detect_filter_construct_corruption, context.info_log); } } @@ -1310,10 +1498,8 @@ FilterBitsBuilder* BloomFilterPolicy::GetBuilderFromContext( } } -// Read metadata to determine what kind of FilterBitsReader is needed -// and return a new one. -FilterBitsReader* BuiltinFilterPolicy::GetFilterBitsReader( - const Slice& contents) const { +BuiltinFilterBitsReader* BuiltinFilterPolicy::GetBuiltinFilterBitsReader( + const Slice& contents) { uint32_t len_with_meta = static_cast(contents.size()); if (len_with_meta <= kMetadataLen) { // filter is empty or broken. Treat like zero keys added. @@ -1393,8 +1579,15 @@ FilterBitsReader* BuiltinFilterPolicy::GetFilterBitsReader( log2_cache_line_size); } -FilterBitsReader* BuiltinFilterPolicy::GetRibbonBitsReader( +// Read metadata to determine what kind of FilterBitsReader is needed +// and return a new one. +FilterBitsReader* BuiltinFilterPolicy::GetFilterBitsReader( const Slice& contents) const { + return BuiltinFilterPolicy::GetBuiltinFilterBitsReader(contents); +} + +BuiltinFilterBitsReader* BuiltinFilterPolicy::GetRibbonBitsReader( + const Slice& contents) { uint32_t len_with_meta = static_cast(contents.size()); uint32_t len = len_with_meta - kMetadataLen; @@ -1417,8 +1610,8 @@ FilterBitsReader* BuiltinFilterPolicy::GetRibbonBitsReader( } // For newer Bloom filter implementations -FilterBitsReader* BuiltinFilterPolicy::GetBloomBitsReader( - const Slice& contents) const { +BuiltinFilterBitsReader* BuiltinFilterPolicy::GetBloomBitsReader( + const Slice& contents) { uint32_t len_with_meta = static_cast(contents.size()); uint32_t len = len_with_meta - kMetadataLen; diff --git a/table/block_based/filter_policy_internal.h b/table/block_based/filter_policy_internal.h index fec1a6056..bccfc0cf6 100644 --- a/table/block_based/filter_policy_internal.h +++ b/table/block_based/filter_policy_internal.h @@ -38,10 +38,21 @@ class BuiltinFilterBitsBuilder : public FilterBitsBuilder { virtual double EstimatedFpRate(size_t num_entries, size_t bytes) = 0; }; +// Base class for RocksDB built-in filter reader with +// extra useful functionalities for inernal. +class BuiltinFilterBitsReader : public FilterBitsReader { + public: + // Check if the hash of the entry match the bits in filter + virtual bool HashMayMatch(const uint64_t /* h */) { return true; } +}; + // Abstract base class for RocksDB built-in filter policies. // This class is considered internal API and subject to change. class BuiltinFilterPolicy : public FilterPolicy { public: + static BuiltinFilterBitsReader* GetBuiltinFilterBitsReader( + const Slice& contents); + // Shared name because any built-in policy can read filters from // any other const char* Name() const override; @@ -60,10 +71,10 @@ class BuiltinFilterPolicy : public FilterPolicy { private: // For Bloom filter implementation(s) (except deprecated block-based filter) - FilterBitsReader* GetBloomBitsReader(const Slice& contents) const; + static BuiltinFilterBitsReader* GetBloomBitsReader(const Slice& contents); // For Ribbon filter implementation(s) - FilterBitsReader* GetRibbonBitsReader(const Slice& contents) const; + static BuiltinFilterBitsReader* GetRibbonBitsReader(const Slice& contents); }; // RocksDB built-in filter policy for Bloom or Bloom-like filters including diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc index 9803acf99..6edf76f3e 100644 --- a/table/block_based/full_filter_block.cc +++ b/table/block_based/full_filter_block.cc @@ -109,8 +109,8 @@ Slice FullFilterBlockBuilder::Finish( *status = Status::OK(); if (any_added_) { any_added_ = false; - Slice filter_content = - filter_bits_builder_->Finish(filter_data ? filter_data : &filter_data_); + Slice filter_content = filter_bits_builder_->Finish( + filter_data ? filter_data : &filter_data_, status); return filter_content; } return Slice(); diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h index bc3f958e0..0068e0f4d 100644 --- a/table/block_based/full_filter_block.h +++ b/table/block_based/full_filter_block.h @@ -63,6 +63,10 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { filter_bits_builder_.reset(); } + virtual Status MaybePostVerifyFilter(const Slice& filter_content) override { + return filter_bits_builder_->MaybePostVerify(filter_content); + } + protected: virtual void AddKey(const Slice& key); std::unique_ptr filter_bits_builder_; diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc index b3563da3e..f919d1aa5 100644 --- a/table/block_based/full_filter_block_test.cc +++ b/table/block_based/full_filter_block_test.cc @@ -30,6 +30,8 @@ class TestFilterBitsBuilder : public FilterBitsBuilder { hash_entries_.push_back(Hash(key.data(), key.size(), 1)); } + using FilterBitsBuilder::Finish; + // Generate the filter using the keys that are added Slice Finish(std::unique_ptr* buf) override { uint32_t len = static_cast(hash_entries_.size()) * 4; @@ -221,8 +223,12 @@ class CountUniqueFilterBitsBuilderWrapper : public FilterBitsBuilder { uniq_.insert(key.ToString()); } + using FilterBitsBuilder::Finish; + Slice Finish(std::unique_ptr* buf) override { Slice rv = b_->Finish(buf); + Status s_dont_care = b_->MaybePostVerify(rv); + s_dont_care.PermitUncheckedError(); uniq_.clear(); return rv; } diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index a2718b25d..d3cbe84fb 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -60,7 +60,9 @@ PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( } } -PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {} +PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() { + partitioned_filters_construction_status_.PermitUncheckedError(); +} void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock( const Slice* next_key) { @@ -88,9 +90,18 @@ void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock( total_added_in_built_ += filter_bits_builder_->EstimateEntriesAdded(); std::unique_ptr filter_data; - Slice filter = filter_bits_builder_->Finish(&filter_data); + Status filter_construction_status = Status::OK(); + Slice filter = + filter_bits_builder_->Finish(&filter_data, &filter_construction_status); + if (filter_construction_status.ok()) { + filter_construction_status = filter_bits_builder_->MaybePostVerify(filter); + } std::string& index_key = p_index_builder_->GetPartitionKey(); - filters.push_back({index_key, filter, std::move(filter_data)}); + filters.push_back({index_key, std::move(filter_data), filter}); + if (!filter_construction_status.ok() && + partitioned_filters_construction_status_.ok()) { + partitioned_filters_construction_status_ = filter_construction_status; + } keys_added_to_partition_ = 0; Reset(); } @@ -132,6 +143,12 @@ Slice PartitionedFilterBlockBuilder::Finish( } else { MaybeCutAFilterBlock(nullptr); } + + if (!partitioned_filters_construction_status_.ok()) { + *status = partitioned_filters_construction_status_; + return Slice(); + } + // If there is no filter partition left, then return the index on filter // partitions if (UNLIKELY(filters.empty())) { diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h index 94c6a9b95..555e940ea 100644 --- a/table/block_based/partitioned_filter_block.h +++ b/table/block_based/partitioned_filter_block.h @@ -41,6 +41,23 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { const BlockHandle& last_partition_block_handle, Status* status, std::unique_ptr* filter_data = nullptr) override; + virtual void ResetFilterBitsBuilder() override { + // Previously constructed partitioned filters by + // this to-be-reset FiterBitsBuilder can also be + // cleared + filters.clear(); + FullFilterBlockBuilder::ResetFilterBitsBuilder(); + } + + // For PartitionFilter, optional post-verifing the filter is done + // as part of PartitionFilterBlockBuilder::Finish + // to avoid implementation complexity of doing it elsewhere. + // Therefore we are skipping it in here. + virtual Status MaybePostVerifyFilter( + const Slice& /* filter_content */) override { + return Status::OK(); + } + private: // Filter data BlockBuilder index_on_filter_block_builder_; // top-level index builder @@ -48,11 +65,17 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { index_on_filter_block_builder_without_seq_; // same for user keys struct FilterEntry { std::string key; - Slice filter; std::unique_ptr filter_data; + Slice filter; }; std::deque filters; // list of partitioned filters and keys used // in building the index + + // Set to the first non-okay status if any of the filter + // partitions experiences construction error. + // If partitioned_filters_construction_status_ is non-okay, + // then the whole partitioned filters should not be used. + Status partitioned_filters_construction_status_; std::string last_filter_entry_key; std::unique_ptr last_filter_data; std::unique_ptr value; diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 9d1f7c56a..db9978177 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -158,6 +158,7 @@ default_params = { "prepopulate_block_cache" : lambda: random.choice([0, 1]), "memtable_prefix_bloom_size_ratio": lambda: random.choice([0.001, 0.01, 0.1, 0.5]), "memtable_whole_key_filtering": lambda: random.randint(0, 1), + "detect_filter_construct_corruption": lambda: random.choice([0, 1]), } _TEST_DIR_ENV_VAR = 'TEST_TMPDIR' diff --git a/util/filter_bench.cc b/util/filter_bench.cc index 69a2a3b45..5cbad3970 100644 --- a/util/filter_bench.cc +++ b/util/filter_bench.cc @@ -94,6 +94,10 @@ DEFINE_bool(net_includes_hashing, false, DEFINE_bool(optimize_filters_for_memory, false, "Setting for BlockBasedTableOptions::optimize_filters_for_memory"); +DEFINE_bool(detect_filter_construct_corruption, false, + "Setting for " + "BlockBasedTableOptions::detect_filter_construct_corruption"); + DEFINE_uint32(block_cache_capacity_MB, 8, "Setting for " "LRUCacheOptions::capacity"); @@ -153,6 +157,7 @@ using ROCKSDB_NAMESPACE::PlainTableBloomV1; using ROCKSDB_NAMESPACE::Random32; using ROCKSDB_NAMESPACE::Slice; using ROCKSDB_NAMESPACE::static_cast_with_check; +using ROCKSDB_NAMESPACE::Status; using ROCKSDB_NAMESPACE::StderrLogger; using ROCKSDB_NAMESPACE::mock::MockBlockBasedTableTester; @@ -206,10 +211,13 @@ void PrintWarnings() { #endif } +void PrintError(const char *error) { fprintf(stderr, "ERROR: %s\n", error); } + struct FilterInfo { uint32_t filter_id_ = 0; std::unique_ptr owner_; Slice filter_; + Status filter_construction_status = Status::OK(); uint32_t keys_added_ = 0; std::unique_ptr reader_; std::unique_ptr full_block_reader_; @@ -300,6 +308,8 @@ struct FilterBench : public MockBlockBasedTableTester { ioptions_.logger = &stderr_logger_; table_options_.optimize_filters_for_memory = FLAGS_optimize_filters_for_memory; + table_options_.detect_filter_construct_corruption = + FLAGS_detect_filter_construct_corruption; if (FLAGS_reserve_table_builder_memory) { table_options_.reserve_table_builder_memory = true; table_options_.no_block_cache = false; @@ -419,7 +429,15 @@ void FilterBench::Go() { for (uint32_t i = 0; i < keys_to_add; ++i) { builder->AddKey(kms_[0].Get(filter_id, i)); } - info.filter_ = builder->Finish(&info.owner_); + info.filter_ = + builder->Finish(&info.owner_, &info.filter_construction_status); + if (info.filter_construction_status.ok()) { + info.filter_construction_status = + builder->MaybePostVerify(info.filter_); + } + if (!info.filter_construction_status.ok()) { + PrintError(info.filter_construction_status.ToString().c_str()); + } #ifdef PREDICT_FP_RATE weighted_predicted_fp_rate += keys_to_add *