You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
rocksdb/table/block_based/partitioned_filter_block.cc

563 lines
21 KiB

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "table/block_based/partitioned_filter_block.h"
#include <utility>
#include "file/random_access_file_reader.h"
#include "logging/logging.h"
#include "monitoring/perf_context_imp.h"
#include "port/malloc.h"
#include "port/port.h"
#include "rocksdb/filter_policy.h"
#include "table/block_based/block.h"
#include "table/block_based/block_based_table_reader.h"
#include "util/coding.h"
namespace ROCKSDB_NAMESPACE {
PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder(
const SliceTransform* _prefix_extractor, bool whole_key_filtering,
FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval,
const bool use_value_delta_encoding,
PartitionedIndexBuilder* const p_index_builder,
const uint32_t partition_size)
: FullFilterBlockBuilder(_prefix_extractor, whole_key_filtering,
filter_bits_builder),
index_on_filter_block_builder_(index_block_restart_interval,
true /*use_delta_encoding*/,
use_value_delta_encoding),
index_on_filter_block_builder_without_seq_(index_block_restart_interval,
true /*use_delta_encoding*/,
use_value_delta_encoding),
p_index_builder_(p_index_builder),
keys_added_to_partition_(0),
total_added_in_built_(0) {
Use size_t for filter APIs, protect against overflow (#7726) Summary: Deprecate CalculateNumEntry and replace with ApproximateNumEntries (better name) using size_t instead of int and uint32_t, to minimize confusing casts and bad overflow behavior (possible though probably not realistic). Bloom sizes are now explicitly capped at max size supported by implementations: just under 4GiB for fv=5 Bloom, and just under 512MiB for fv<5 Legacy Bloom. This hardening could help to set up for fuzzing. Also, since RocksDB only uses this information as an approximation for trying to hit certain sizes for partitioned filters, it's more important that the function be reasonably fast than for it to be completely accurate. It's hard enough to be 100% accurate for Ribbon (currently reversing CalculateSpace) that adding optimize_filters_for_memory into the mix is just not worth trying to be 100% accurate for num entries for bytes. Also: - Cleaned up filter_policy.h to remove MSVC warning handling and potentially unsafe use of exception for "not implemented" - Correct the number of entries limit beyond which current Ribbon implementation falls back on Bloom instead. - Consistently use "num_entries" rather than "num_entry" - Remove LegacyBloomBitsBuilder::CalculateNumEntry as it's essentially obsolete from general implementation BuiltinFilterBitsBuilder::CalculateNumEntries. - Fix filter_bench to skip some tests that don't make sense when only one or a small number of filters has been generated. Pull Request resolved: https://github.com/facebook/rocksdb/pull/7726 Test Plan: expanded existing unit tests for CalculateSpace / ApproximateNumEntries. Also manually used filter_bench to verify Legacy and fv=5 Bloom size caps work (much too expensive for unit test). Note that the actual bits per key is below requested due to space cap. $ ./filter_bench -impl=0 -bits_per_key=20 -average_keys_per_filter=256000000 -vary_key_count_ratio=0 -m_keys_total_max=256 -allow_bad_fp_rate ... Total size (MB): 511.992 Bits/key stored: 16.777 ... $ ./filter_bench -impl=2 -bits_per_key=20 -average_keys_per_filter=2000000000 -vary_key_count_ratio=0 -m_keys_total_max=2000 ... Total size (MB): 4096 Bits/key stored: 17.1799 ... $ Reviewed By: jay-zhuang Differential Revision: D25239800 Pulled By: pdillinger fbshipit-source-id: f94e6d065efd31e05ec630ae1a82e6400d8390c4
4 years ago
keys_per_partition_ = static_cast<uint32_t>(
filter_bits_builder_->ApproximateNumEntries(partition_size));
if (keys_per_partition_ < 1) {
// partition_size (minus buffer, ~10%) might be smaller than minimum
// filter size, sometimes based on cache line size. Try to find that
// minimum size without CalculateSpace (not necessarily available).
uint32_t larger = std::max(partition_size + 4, uint32_t{16});
for (;;) {
Use size_t for filter APIs, protect against overflow (#7726) Summary: Deprecate CalculateNumEntry and replace with ApproximateNumEntries (better name) using size_t instead of int and uint32_t, to minimize confusing casts and bad overflow behavior (possible though probably not realistic). Bloom sizes are now explicitly capped at max size supported by implementations: just under 4GiB for fv=5 Bloom, and just under 512MiB for fv<5 Legacy Bloom. This hardening could help to set up for fuzzing. Also, since RocksDB only uses this information as an approximation for trying to hit certain sizes for partitioned filters, it's more important that the function be reasonably fast than for it to be completely accurate. It's hard enough to be 100% accurate for Ribbon (currently reversing CalculateSpace) that adding optimize_filters_for_memory into the mix is just not worth trying to be 100% accurate for num entries for bytes. Also: - Cleaned up filter_policy.h to remove MSVC warning handling and potentially unsafe use of exception for "not implemented" - Correct the number of entries limit beyond which current Ribbon implementation falls back on Bloom instead. - Consistently use "num_entries" rather than "num_entry" - Remove LegacyBloomBitsBuilder::CalculateNumEntry as it's essentially obsolete from general implementation BuiltinFilterBitsBuilder::CalculateNumEntries. - Fix filter_bench to skip some tests that don't make sense when only one or a small number of filters has been generated. Pull Request resolved: https://github.com/facebook/rocksdb/pull/7726 Test Plan: expanded existing unit tests for CalculateSpace / ApproximateNumEntries. Also manually used filter_bench to verify Legacy and fv=5 Bloom size caps work (much too expensive for unit test). Note that the actual bits per key is below requested due to space cap. $ ./filter_bench -impl=0 -bits_per_key=20 -average_keys_per_filter=256000000 -vary_key_count_ratio=0 -m_keys_total_max=256 -allow_bad_fp_rate ... Total size (MB): 511.992 Bits/key stored: 16.777 ... $ ./filter_bench -impl=2 -bits_per_key=20 -average_keys_per_filter=2000000000 -vary_key_count_ratio=0 -m_keys_total_max=2000 ... Total size (MB): 4096 Bits/key stored: 17.1799 ... $ Reviewed By: jay-zhuang Differential Revision: D25239800 Pulled By: pdillinger fbshipit-source-id: f94e6d065efd31e05ec630ae1a82e6400d8390c4
4 years ago
keys_per_partition_ = static_cast<uint32_t>(
filter_bits_builder_->ApproximateNumEntries(larger));
if (keys_per_partition_ >= 1) {
break;
}
larger += larger / 4;
if (larger > 100000) {
// might be a broken implementation. substitute something reasonable:
// 1 key / byte.
keys_per_partition_ = partition_size;
break;
}
}
}
}
Detect (new) Bloom/Ribbon Filter construction corruption (#9342) Summary: Note: rebase on and merge after https://github.com/facebook/rocksdb/pull/9349, https://github.com/facebook/rocksdb/pull/9345, (optional) https://github.com/facebook/rocksdb/pull/9393 **Context:** (Quoted from pdillinger) Layers of information during new Bloom/Ribbon Filter construction in building block-based tables includes the following: a) set of keys to add to filter b) set of hashes to add to filter (64-bit hash applied to each key) c) set of Bloom indices to set in filter, with duplicates d) set of Bloom indices to set in filter, deduplicated e) final filter and its checksum This PR aims to detect corruption (e.g, unexpected hardware/software corruption on data structures residing in the memory for a long time) from b) to e) and leave a) as future works for application level. - b)'s corruption is detected by verifying the xor checksum of the hash entries calculated as the entries accumulate before being added to the filter. (i.e, `XXPH3FilterBitsBuilder::MaybeVerifyHashEntriesChecksum()`) - c) - e)'s corruption is detected by verifying the hash entries indeed exists in the constructed filter by re-querying these hash entries in the filter (i.e, `FilterBitsBuilder::MaybePostVerify()`) after computing the block checksum (except for PartitionFilter, which is done right after each `FilterBitsBuilder::Finish` for impl simplicity - see code comment for more). For this stage of detection, we assume hash entries are not corrupted after checking on b) since the time interval from b) to c) is relatively short IMO. Option to enable this feature of detection is `BlockBasedTableOptions::detect_filter_construct_corruption` which is false by default. **Summary:** - Implemented new functions `XXPH3FilterBitsBuilder::MaybeVerifyHashEntriesChecksum()` and `FilterBitsBuilder::MaybePostVerify()` - Ensured hash entries, final filter and banding and their [cache reservation ](https://github.com/facebook/rocksdb/issues/9073) are released properly despite corruption - See [Filter.construction.artifacts.release.point.pdf ](https://github.com/facebook/rocksdb/files/7923487/Design.Filter.construction.artifacts.release.point.pdf) for high-level design - Bundled and refactored hash entries's related artifact in XXPH3FilterBitsBuilder into `HashEntriesInfo` for better control on lifetime of these artifact during `SwapEntires`, `ResetEntries` - Ensured RocksDB block-based table builder calls `FilterBitsBuilder::MaybePostVerify()` after constructing the filter by `FilterBitsBuilder::Finish()` - When encountering such filter construction corruption, stop writing the filter content to files and mark such a block-based table building non-ok by storing the corruption status in the builder. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9342 Test Plan: - Added new unit test `DBFilterConstructionCorruptionTestWithParam.DetectCorruption` - Included this new feature in `DBFilterConstructionReserveMemoryTestWithParam.ReserveMemory` as this feature heavily touch ReserveMemory's impl - For fallback case, I run `./filter_bench -impl=3 -detect_filter_construct_corruption=true -reserve_table_builder_memory=true -strict_capacity_limit=true -quick -runs 10 | grep 'Build avg'` to make sure nothing break. - Added to `filter_bench`: increased filter construction time by **30%**, mostly by `MaybePostVerify()` - FastLocalBloom - Before change: `./filter_bench -impl=2 -quick -runs 10 | grep 'Build avg'`: **28.86643s** - After change: - `./filter_bench -impl=2 -detect_filter_construct_corruption=false -quick -runs 10 | grep 'Build avg'` (expect a tiny increase due to MaybePostVerify is always called regardless): **27.6644s (-4% perf improvement might be due to now we don't drop bloom hash entry in `AddAllEntries` along iteration but in bulk later, same with the bypassing-MaybePostVerify case below)** - `./filter_bench -impl=2 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'` (expect acceptable increase): **34.41159s (+20%)** - `./filter_bench -impl=2 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'` (by-passing MaybePostVerify, expect minor increase): **27.13431s (-6%)** - Standard128Ribbon - Before change: `./filter_bench -impl=3 -quick -runs 10 | grep 'Build avg'`: **122.5384s** - After change: - `./filter_bench -impl=3 -detect_filter_construct_corruption=false -quick -runs 10 | grep 'Build avg'` (expect a tiny increase due to MaybePostVerify is always called regardless - verified by removing MaybePostVerify under this case and found only +-1ns difference): **124.3588s (+2%)** - `./filter_bench -impl=3 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'`(expect acceptable increase): **159.4946s (+30%)** - `./filter_bench -impl=3 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'`(by-passing MaybePostVerify, expect minor increase) : **125.258s (+2%)** - Added to `db_stress`: `make crash_test`, `./db_stress --detect_filter_construct_corruption=true` - Manually smoke-tested: manually corrupted the filter construction in some db level tests with basic PUT and background flush. As expected, the error did get returned to users in subsequent PUT and Flush status. Reviewed By: pdillinger Differential Revision: D33746928 Pulled By: hx235 fbshipit-source-id: cb056426be5a7debc1cd16f23bc250f36a08ca57
3 years ago
PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {
partitioned_filters_construction_status_.PermitUncheckedError();
}
void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock(
const Slice* next_key) {
// Use == to send the request only once
if (keys_added_to_partition_ == keys_per_partition_) {
// Currently only index builder is in charge of cutting a partition. We keep
// requesting until it is granted.
p_index_builder_->RequestPartitionCut();
}
if (!p_index_builder_->ShouldCutFilterBlock()) {
return;
}
Fix a bug for SeekForPrev with partitioned filter and prefix (#8137) Summary: According to https://github.com/facebook/rocksdb/issues/5907, each filter partition "should include the bloom of the prefix of the last key in the previous partition" so that SeekForPrev() in prefix mode can return correct result. The prefix of the last key in the previous partition does not necessarily have the same prefix as the first key in the current partition. Regardless of the first key in current partition, the prefix of the last key in the previous partition should be added. The existing code, however, does not follow this. Furthermore, there is another issue: when finishing current filter partition, `FullFilterBlockBuilder::AddPrefix()` is called for the first key in next filter partition, which effectively overwrites `last_prefix_str_` prematurely. Consequently, when the filter block builder proceeds to the next partition, `last_prefix_str_` will be the prefix of its first key, leaving no way of adding the bloom of the prefix of the last key of the previous partition. Prefix extractor is FixedLength.2. ``` [ filter part 1 ] [ filter part 2 ] abc d ``` When SeekForPrev("abcd"), checking the filter partition will land on filter part 2 because "abcd" > "abc" but smaller than "d". If the filter in filter part 2 happens to return false for the test for "ab", then SeekForPrev("abcd") will build incorrect iterator tree in non-total-order mode. Also fix a unit test which starts to fail following this PR. `InDomain` should not fail due to assertion error when checking on an arbitrary key. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8137 Test Plan: ``` make check ``` Without this fix, the following command will fail pretty soon. ``` ./db_stress --acquire_snapshot_one_in=10000 --avoid_flush_during_recovery=0 \ --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=0 \ --batch_protection_bytes_per_key=0 --block_size=16384 --bloom_bits=17 \ --bottommost_compression_type=disable --cache_index_and_filter_blocks=1 --cache_size=1048576 \ --checkpoint_one_in=0 --checksum_type=kxxHash64 --clear_column_family_one_in=0 \ --compact_files_one_in=1000000 --compact_range_one_in=1000000 --compaction_ttl=0 \ --compression_max_dict_buffer_bytes=0 --compression_max_dict_bytes=0 \ --compression_parallel_threads=1 --compression_type=zstd --compression_zstd_max_train_bytes=0 \ --continuous_verification_interval=0 --db=/dev/shm/rocksdb/rocksdb_crashtest_whitebox \ --db_write_buffer_size=8388608 --delpercent=5 --delrangepercent=0 --destroy_db_initially=0 --enable_blob_files=0 \ --enable_compaction_filter=0 --enable_pipelined_write=1 --file_checksum_impl=big --flush_one_in=1000000 \ --format_version=5 --get_current_wal_file_one_in=0 --get_live_files_one_in=1000000 --get_property_one_in=1000000 \ --get_sorted_wal_files_one_in=0 --index_block_restart_interval=4 --index_type=2 --ingest_external_file_one_in=0 \ --iterpercent=10 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=True \ --log2_keys_per_lock=10 --long_running_snapshots=1 --mark_for_compaction_one_file_in=0 \ --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=100000000 --max_key_len=3 \ --max_manifest_file_size=1073741824 --max_write_batch_group_size_bytes=16777216 --max_write_buffer_number=3 \ --max_write_buffer_size_to_maintain=8388608 --memtablerep=skip_list --mmap_read=1 --mock_direct_io=False \ --nooverwritepercent=0 --open_files=500000 --ops_per_thread=20000000 --optimize_filters_for_memory=0 --paranoid_file_checks=1 --partition_filters=1 --partition_pinning=0 --pause_background_one_in=1000000 \ --periodic_compaction_seconds=0 --prefixpercent=5 --progress_reports=0 --read_fault_one_in=0 --read_only=0 \ --readpercent=45 --recycle_log_file_num=0 --reopen=20 --secondary_catch_up_one_in=0 \ --snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=104857600 \ --sst_file_manager_bytes_per_truncate=0 --subcompactions=2 --sync=0 --sync_fault_injection=False \ --target_file_size_base=2097152 --target_file_size_multiplier=2 --test_batches_snapshots=0 --test_cf_consistency=0 \ --top_level_index_pinning=0 --unpartitioned_pinning=1 --use_blob_db=0 --use_block_based_filter=0 \ --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_merge=0 \ --use_multiget=0 --use_ribbon_filter=0 --use_txn=0 --user_timestamp_size=8 --verify_checksum=1 \ --verify_checksum_one_in=1000000 --verify_db_one_in=100000 --write_buffer_size=4194304 \ --write_dbid_to_manifest=1 --writepercent=35 ``` Reviewed By: pdillinger Differential Revision: D27553054 Pulled By: riversand963 fbshipit-source-id: 60e391e4a2d8d98a9a3172ec5d6176b90ec3de98
4 years ago
// Add the prefix of the next key before finishing the partition without
// updating last_prefix_str_. This hack, fixes a bug with format_verison=3
// where seeking for the prefix would lead us to the previous partition.
const bool maybe_add_prefix =
next_key && prefix_extractor() && prefix_extractor()->InDomain(*next_key);
Fix a bug for SeekForPrev with partitioned filter and prefix (#8137) Summary: According to https://github.com/facebook/rocksdb/issues/5907, each filter partition "should include the bloom of the prefix of the last key in the previous partition" so that SeekForPrev() in prefix mode can return correct result. The prefix of the last key in the previous partition does not necessarily have the same prefix as the first key in the current partition. Regardless of the first key in current partition, the prefix of the last key in the previous partition should be added. The existing code, however, does not follow this. Furthermore, there is another issue: when finishing current filter partition, `FullFilterBlockBuilder::AddPrefix()` is called for the first key in next filter partition, which effectively overwrites `last_prefix_str_` prematurely. Consequently, when the filter block builder proceeds to the next partition, `last_prefix_str_` will be the prefix of its first key, leaving no way of adding the bloom of the prefix of the last key of the previous partition. Prefix extractor is FixedLength.2. ``` [ filter part 1 ] [ filter part 2 ] abc d ``` When SeekForPrev("abcd"), checking the filter partition will land on filter part 2 because "abcd" > "abc" but smaller than "d". If the filter in filter part 2 happens to return false for the test for "ab", then SeekForPrev("abcd") will build incorrect iterator tree in non-total-order mode. Also fix a unit test which starts to fail following this PR. `InDomain` should not fail due to assertion error when checking on an arbitrary key. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8137 Test Plan: ``` make check ``` Without this fix, the following command will fail pretty soon. ``` ./db_stress --acquire_snapshot_one_in=10000 --avoid_flush_during_recovery=0 \ --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=0 \ --batch_protection_bytes_per_key=0 --block_size=16384 --bloom_bits=17 \ --bottommost_compression_type=disable --cache_index_and_filter_blocks=1 --cache_size=1048576 \ --checkpoint_one_in=0 --checksum_type=kxxHash64 --clear_column_family_one_in=0 \ --compact_files_one_in=1000000 --compact_range_one_in=1000000 --compaction_ttl=0 \ --compression_max_dict_buffer_bytes=0 --compression_max_dict_bytes=0 \ --compression_parallel_threads=1 --compression_type=zstd --compression_zstd_max_train_bytes=0 \ --continuous_verification_interval=0 --db=/dev/shm/rocksdb/rocksdb_crashtest_whitebox \ --db_write_buffer_size=8388608 --delpercent=5 --delrangepercent=0 --destroy_db_initially=0 --enable_blob_files=0 \ --enable_compaction_filter=0 --enable_pipelined_write=1 --file_checksum_impl=big --flush_one_in=1000000 \ --format_version=5 --get_current_wal_file_one_in=0 --get_live_files_one_in=1000000 --get_property_one_in=1000000 \ --get_sorted_wal_files_one_in=0 --index_block_restart_interval=4 --index_type=2 --ingest_external_file_one_in=0 \ --iterpercent=10 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=True \ --log2_keys_per_lock=10 --long_running_snapshots=1 --mark_for_compaction_one_file_in=0 \ --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=100000000 --max_key_len=3 \ --max_manifest_file_size=1073741824 --max_write_batch_group_size_bytes=16777216 --max_write_buffer_number=3 \ --max_write_buffer_size_to_maintain=8388608 --memtablerep=skip_list --mmap_read=1 --mock_direct_io=False \ --nooverwritepercent=0 --open_files=500000 --ops_per_thread=20000000 --optimize_filters_for_memory=0 --paranoid_file_checks=1 --partition_filters=1 --partition_pinning=0 --pause_background_one_in=1000000 \ --periodic_compaction_seconds=0 --prefixpercent=5 --progress_reports=0 --read_fault_one_in=0 --read_only=0 \ --readpercent=45 --recycle_log_file_num=0 --reopen=20 --secondary_catch_up_one_in=0 \ --snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=104857600 \ --sst_file_manager_bytes_per_truncate=0 --subcompactions=2 --sync=0 --sync_fault_injection=False \ --target_file_size_base=2097152 --target_file_size_multiplier=2 --test_batches_snapshots=0 --test_cf_consistency=0 \ --top_level_index_pinning=0 --unpartitioned_pinning=1 --use_blob_db=0 --use_block_based_filter=0 \ --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_merge=0 \ --use_multiget=0 --use_ribbon_filter=0 --use_txn=0 --user_timestamp_size=8 --verify_checksum=1 \ --verify_checksum_one_in=1000000 --verify_db_one_in=100000 --write_buffer_size=4194304 \ --write_dbid_to_manifest=1 --writepercent=35 ``` Reviewed By: pdillinger Differential Revision: D27553054 Pulled By: riversand963 fbshipit-source-id: 60e391e4a2d8d98a9a3172ec5d6176b90ec3de98
4 years ago
if (maybe_add_prefix) {
const Slice next_key_prefix = prefix_extractor()->Transform(*next_key);
if (next_key_prefix.compare(last_prefix_str()) != 0) {
AddKey(next_key_prefix);
}
}
total_added_in_built_ += filter_bits_builder_->EstimateEntriesAdded();
Deallocate payload of BlockBasedTableBuilder::Rep::FilterBlockBuilder earlier for Full/PartitionedFilter (#9070) Summary: Note: This PR is the 1st part of a bigger PR stack (https://github.com/facebook/rocksdb/pull/9073). Context: Previously, the payload (i.e, filter data) within `BlockBasedTableBuilder::Rep::FilterBlockBuilder` object is not deallocated until `BlockBasedTableBuilder` is deallocated, despite it is no longer useful after its related `filter_content` being written. - Transferred the payload (i.e, the filter data) out of `BlockBasedTableBuilder::Rep::FilterBlockBuilder` object - For PartitionedFilter: - Unified `filters` and `filter_gc` lists into one `std::deque<FilterEntry> filters` by adding a new field `last_filter_entry_key` and storing the `std::unique_ptr filter_data` with the `Slice filter` in the same entry - Reset `last_filter_data` in the case where `filters` is empty, which should be as by then we would've finish using all the `Slice filter` - Deallocated the payload by going out of scope as soon as we're done with using the `filter_content` associated with the payload - This is an internal interface change at the level of `FilterBlockBuilder::Finish()`, which leads to touching the inherited interface in `BlockBasedFilterBlockBuilder`. But for that, the payload transferring is ignored. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9070 Test Plan: - The main focus is to catch segment fault error during `FilterBlockBuilder::Finish()` and `BlockBasedTableBuilder::Finish()` and interface mismatch. Relying on existing CI tests is enough as `assert(false)` was temporarily added to verify the new logic of transferring ownership indeed run Reviewed By: pdillinger Differential Revision: D31884933 Pulled By: hx235 fbshipit-source-id: f73ecfbea13788d4fc058013ace27230110b52f4
3 years ago
std::unique_ptr<const char[]> filter_data;
Detect (new) Bloom/Ribbon Filter construction corruption (#9342) Summary: Note: rebase on and merge after https://github.com/facebook/rocksdb/pull/9349, https://github.com/facebook/rocksdb/pull/9345, (optional) https://github.com/facebook/rocksdb/pull/9393 **Context:** (Quoted from pdillinger) Layers of information during new Bloom/Ribbon Filter construction in building block-based tables includes the following: a) set of keys to add to filter b) set of hashes to add to filter (64-bit hash applied to each key) c) set of Bloom indices to set in filter, with duplicates d) set of Bloom indices to set in filter, deduplicated e) final filter and its checksum This PR aims to detect corruption (e.g, unexpected hardware/software corruption on data structures residing in the memory for a long time) from b) to e) and leave a) as future works for application level. - b)'s corruption is detected by verifying the xor checksum of the hash entries calculated as the entries accumulate before being added to the filter. (i.e, `XXPH3FilterBitsBuilder::MaybeVerifyHashEntriesChecksum()`) - c) - e)'s corruption is detected by verifying the hash entries indeed exists in the constructed filter by re-querying these hash entries in the filter (i.e, `FilterBitsBuilder::MaybePostVerify()`) after computing the block checksum (except for PartitionFilter, which is done right after each `FilterBitsBuilder::Finish` for impl simplicity - see code comment for more). For this stage of detection, we assume hash entries are not corrupted after checking on b) since the time interval from b) to c) is relatively short IMO. Option to enable this feature of detection is `BlockBasedTableOptions::detect_filter_construct_corruption` which is false by default. **Summary:** - Implemented new functions `XXPH3FilterBitsBuilder::MaybeVerifyHashEntriesChecksum()` and `FilterBitsBuilder::MaybePostVerify()` - Ensured hash entries, final filter and banding and their [cache reservation ](https://github.com/facebook/rocksdb/issues/9073) are released properly despite corruption - See [Filter.construction.artifacts.release.point.pdf ](https://github.com/facebook/rocksdb/files/7923487/Design.Filter.construction.artifacts.release.point.pdf) for high-level design - Bundled and refactored hash entries's related artifact in XXPH3FilterBitsBuilder into `HashEntriesInfo` for better control on lifetime of these artifact during `SwapEntires`, `ResetEntries` - Ensured RocksDB block-based table builder calls `FilterBitsBuilder::MaybePostVerify()` after constructing the filter by `FilterBitsBuilder::Finish()` - When encountering such filter construction corruption, stop writing the filter content to files and mark such a block-based table building non-ok by storing the corruption status in the builder. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9342 Test Plan: - Added new unit test `DBFilterConstructionCorruptionTestWithParam.DetectCorruption` - Included this new feature in `DBFilterConstructionReserveMemoryTestWithParam.ReserveMemory` as this feature heavily touch ReserveMemory's impl - For fallback case, I run `./filter_bench -impl=3 -detect_filter_construct_corruption=true -reserve_table_builder_memory=true -strict_capacity_limit=true -quick -runs 10 | grep 'Build avg'` to make sure nothing break. - Added to `filter_bench`: increased filter construction time by **30%**, mostly by `MaybePostVerify()` - FastLocalBloom - Before change: `./filter_bench -impl=2 -quick -runs 10 | grep 'Build avg'`: **28.86643s** - After change: - `./filter_bench -impl=2 -detect_filter_construct_corruption=false -quick -runs 10 | grep 'Build avg'` (expect a tiny increase due to MaybePostVerify is always called regardless): **27.6644s (-4% perf improvement might be due to now we don't drop bloom hash entry in `AddAllEntries` along iteration but in bulk later, same with the bypassing-MaybePostVerify case below)** - `./filter_bench -impl=2 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'` (expect acceptable increase): **34.41159s (+20%)** - `./filter_bench -impl=2 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'` (by-passing MaybePostVerify, expect minor increase): **27.13431s (-6%)** - Standard128Ribbon - Before change: `./filter_bench -impl=3 -quick -runs 10 | grep 'Build avg'`: **122.5384s** - After change: - `./filter_bench -impl=3 -detect_filter_construct_corruption=false -quick -runs 10 | grep 'Build avg'` (expect a tiny increase due to MaybePostVerify is always called regardless - verified by removing MaybePostVerify under this case and found only +-1ns difference): **124.3588s (+2%)** - `./filter_bench -impl=3 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'`(expect acceptable increase): **159.4946s (+30%)** - `./filter_bench -impl=3 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'`(by-passing MaybePostVerify, expect minor increase) : **125.258s (+2%)** - Added to `db_stress`: `make crash_test`, `./db_stress --detect_filter_construct_corruption=true` - Manually smoke-tested: manually corrupted the filter construction in some db level tests with basic PUT and background flush. As expected, the error did get returned to users in subsequent PUT and Flush status. Reviewed By: pdillinger Differential Revision: D33746928 Pulled By: hx235 fbshipit-source-id: cb056426be5a7debc1cd16f23bc250f36a08ca57
3 years ago
Status filter_construction_status = Status::OK();
Slice filter =
filter_bits_builder_->Finish(&filter_data, &filter_construction_status);
if (filter_construction_status.ok()) {
filter_construction_status = filter_bits_builder_->MaybePostVerify(filter);
}
std::string& index_key = p_index_builder_->GetPartitionKey();
Detect (new) Bloom/Ribbon Filter construction corruption (#9342) Summary: Note: rebase on and merge after https://github.com/facebook/rocksdb/pull/9349, https://github.com/facebook/rocksdb/pull/9345, (optional) https://github.com/facebook/rocksdb/pull/9393 **Context:** (Quoted from pdillinger) Layers of information during new Bloom/Ribbon Filter construction in building block-based tables includes the following: a) set of keys to add to filter b) set of hashes to add to filter (64-bit hash applied to each key) c) set of Bloom indices to set in filter, with duplicates d) set of Bloom indices to set in filter, deduplicated e) final filter and its checksum This PR aims to detect corruption (e.g, unexpected hardware/software corruption on data structures residing in the memory for a long time) from b) to e) and leave a) as future works for application level. - b)'s corruption is detected by verifying the xor checksum of the hash entries calculated as the entries accumulate before being added to the filter. (i.e, `XXPH3FilterBitsBuilder::MaybeVerifyHashEntriesChecksum()`) - c) - e)'s corruption is detected by verifying the hash entries indeed exists in the constructed filter by re-querying these hash entries in the filter (i.e, `FilterBitsBuilder::MaybePostVerify()`) after computing the block checksum (except for PartitionFilter, which is done right after each `FilterBitsBuilder::Finish` for impl simplicity - see code comment for more). For this stage of detection, we assume hash entries are not corrupted after checking on b) since the time interval from b) to c) is relatively short IMO. Option to enable this feature of detection is `BlockBasedTableOptions::detect_filter_construct_corruption` which is false by default. **Summary:** - Implemented new functions `XXPH3FilterBitsBuilder::MaybeVerifyHashEntriesChecksum()` and `FilterBitsBuilder::MaybePostVerify()` - Ensured hash entries, final filter and banding and their [cache reservation ](https://github.com/facebook/rocksdb/issues/9073) are released properly despite corruption - See [Filter.construction.artifacts.release.point.pdf ](https://github.com/facebook/rocksdb/files/7923487/Design.Filter.construction.artifacts.release.point.pdf) for high-level design - Bundled and refactored hash entries's related artifact in XXPH3FilterBitsBuilder into `HashEntriesInfo` for better control on lifetime of these artifact during `SwapEntires`, `ResetEntries` - Ensured RocksDB block-based table builder calls `FilterBitsBuilder::MaybePostVerify()` after constructing the filter by `FilterBitsBuilder::Finish()` - When encountering such filter construction corruption, stop writing the filter content to files and mark such a block-based table building non-ok by storing the corruption status in the builder. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9342 Test Plan: - Added new unit test `DBFilterConstructionCorruptionTestWithParam.DetectCorruption` - Included this new feature in `DBFilterConstructionReserveMemoryTestWithParam.ReserveMemory` as this feature heavily touch ReserveMemory's impl - For fallback case, I run `./filter_bench -impl=3 -detect_filter_construct_corruption=true -reserve_table_builder_memory=true -strict_capacity_limit=true -quick -runs 10 | grep 'Build avg'` to make sure nothing break. - Added to `filter_bench`: increased filter construction time by **30%**, mostly by `MaybePostVerify()` - FastLocalBloom - Before change: `./filter_bench -impl=2 -quick -runs 10 | grep 'Build avg'`: **28.86643s** - After change: - `./filter_bench -impl=2 -detect_filter_construct_corruption=false -quick -runs 10 | grep 'Build avg'` (expect a tiny increase due to MaybePostVerify is always called regardless): **27.6644s (-4% perf improvement might be due to now we don't drop bloom hash entry in `AddAllEntries` along iteration but in bulk later, same with the bypassing-MaybePostVerify case below)** - `./filter_bench -impl=2 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'` (expect acceptable increase): **34.41159s (+20%)** - `./filter_bench -impl=2 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'` (by-passing MaybePostVerify, expect minor increase): **27.13431s (-6%)** - Standard128Ribbon - Before change: `./filter_bench -impl=3 -quick -runs 10 | grep 'Build avg'`: **122.5384s** - After change: - `./filter_bench -impl=3 -detect_filter_construct_corruption=false -quick -runs 10 | grep 'Build avg'` (expect a tiny increase due to MaybePostVerify is always called regardless - verified by removing MaybePostVerify under this case and found only +-1ns difference): **124.3588s (+2%)** - `./filter_bench -impl=3 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'`(expect acceptable increase): **159.4946s (+30%)** - `./filter_bench -impl=3 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'`(by-passing MaybePostVerify, expect minor increase) : **125.258s (+2%)** - Added to `db_stress`: `make crash_test`, `./db_stress --detect_filter_construct_corruption=true` - Manually smoke-tested: manually corrupted the filter construction in some db level tests with basic PUT and background flush. As expected, the error did get returned to users in subsequent PUT and Flush status. Reviewed By: pdillinger Differential Revision: D33746928 Pulled By: hx235 fbshipit-source-id: cb056426be5a7debc1cd16f23bc250f36a08ca57
3 years ago
filters.push_back({index_key, std::move(filter_data), filter});
if (!filter_construction_status.ok() &&
partitioned_filters_construction_status_.ok()) {
partitioned_filters_construction_status_ = filter_construction_status;
}
keys_added_to_partition_ = 0;
Reset();
}
void PartitionedFilterBlockBuilder::Add(const Slice& key) {
MaybeCutAFilterBlock(&key);
FullFilterBlockBuilder::Add(key);
}
void PartitionedFilterBlockBuilder::AddKey(const Slice& key) {
FullFilterBlockBuilder::AddKey(key);
keys_added_to_partition_++;
}
size_t PartitionedFilterBlockBuilder::EstimateEntriesAdded() {
return total_added_in_built_ + filter_bits_builder_->EstimateEntriesAdded();
}
Slice PartitionedFilterBlockBuilder::Finish(
Deallocate payload of BlockBasedTableBuilder::Rep::FilterBlockBuilder earlier for Full/PartitionedFilter (#9070) Summary: Note: This PR is the 1st part of a bigger PR stack (https://github.com/facebook/rocksdb/pull/9073). Context: Previously, the payload (i.e, filter data) within `BlockBasedTableBuilder::Rep::FilterBlockBuilder` object is not deallocated until `BlockBasedTableBuilder` is deallocated, despite it is no longer useful after its related `filter_content` being written. - Transferred the payload (i.e, the filter data) out of `BlockBasedTableBuilder::Rep::FilterBlockBuilder` object - For PartitionedFilter: - Unified `filters` and `filter_gc` lists into one `std::deque<FilterEntry> filters` by adding a new field `last_filter_entry_key` and storing the `std::unique_ptr filter_data` with the `Slice filter` in the same entry - Reset `last_filter_data` in the case where `filters` is empty, which should be as by then we would've finish using all the `Slice filter` - Deallocated the payload by going out of scope as soon as we're done with using the `filter_content` associated with the payload - This is an internal interface change at the level of `FilterBlockBuilder::Finish()`, which leads to touching the inherited interface in `BlockBasedFilterBlockBuilder`. But for that, the payload transferring is ignored. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9070 Test Plan: - The main focus is to catch segment fault error during `FilterBlockBuilder::Finish()` and `BlockBasedTableBuilder::Finish()` and interface mismatch. Relying on existing CI tests is enough as `assert(false)` was temporarily added to verify the new logic of transferring ownership indeed run Reviewed By: pdillinger Differential Revision: D31884933 Pulled By: hx235 fbshipit-source-id: f73ecfbea13788d4fc058013ace27230110b52f4
3 years ago
const BlockHandle& last_partition_block_handle, Status* status,
std::unique_ptr<const char[]>* filter_data) {
if (finishing_filters == true) {
// Record the handle of the last written filter block in the index
std::string handle_encoding;
last_partition_block_handle.EncodeTo(&handle_encoding);
std::string handle_delta_encoding;
PutVarsignedint64(
&handle_delta_encoding,
last_partition_block_handle.size() - last_encoded_handle_.size());
last_encoded_handle_ = last_partition_block_handle;
const Slice handle_delta_encoding_slice(handle_delta_encoding);
Deallocate payload of BlockBasedTableBuilder::Rep::FilterBlockBuilder earlier for Full/PartitionedFilter (#9070) Summary: Note: This PR is the 1st part of a bigger PR stack (https://github.com/facebook/rocksdb/pull/9073). Context: Previously, the payload (i.e, filter data) within `BlockBasedTableBuilder::Rep::FilterBlockBuilder` object is not deallocated until `BlockBasedTableBuilder` is deallocated, despite it is no longer useful after its related `filter_content` being written. - Transferred the payload (i.e, the filter data) out of `BlockBasedTableBuilder::Rep::FilterBlockBuilder` object - For PartitionedFilter: - Unified `filters` and `filter_gc` lists into one `std::deque<FilterEntry> filters` by adding a new field `last_filter_entry_key` and storing the `std::unique_ptr filter_data` with the `Slice filter` in the same entry - Reset `last_filter_data` in the case where `filters` is empty, which should be as by then we would've finish using all the `Slice filter` - Deallocated the payload by going out of scope as soon as we're done with using the `filter_content` associated with the payload - This is an internal interface change at the level of `FilterBlockBuilder::Finish()`, which leads to touching the inherited interface in `BlockBasedFilterBlockBuilder`. But for that, the payload transferring is ignored. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9070 Test Plan: - The main focus is to catch segment fault error during `FilterBlockBuilder::Finish()` and `BlockBasedTableBuilder::Finish()` and interface mismatch. Relying on existing CI tests is enough as `assert(false)` was temporarily added to verify the new logic of transferring ownership indeed run Reviewed By: pdillinger Differential Revision: D31884933 Pulled By: hx235 fbshipit-source-id: f73ecfbea13788d4fc058013ace27230110b52f4
3 years ago
index_on_filter_block_builder_.Add(last_filter_entry_key, handle_encoding,
&handle_delta_encoding_slice);
if (!p_index_builder_->seperator_is_key_plus_seq()) {
index_on_filter_block_builder_without_seq_.Add(
Deallocate payload of BlockBasedTableBuilder::Rep::FilterBlockBuilder earlier for Full/PartitionedFilter (#9070) Summary: Note: This PR is the 1st part of a bigger PR stack (https://github.com/facebook/rocksdb/pull/9073). Context: Previously, the payload (i.e, filter data) within `BlockBasedTableBuilder::Rep::FilterBlockBuilder` object is not deallocated until `BlockBasedTableBuilder` is deallocated, despite it is no longer useful after its related `filter_content` being written. - Transferred the payload (i.e, the filter data) out of `BlockBasedTableBuilder::Rep::FilterBlockBuilder` object - For PartitionedFilter: - Unified `filters` and `filter_gc` lists into one `std::deque<FilterEntry> filters` by adding a new field `last_filter_entry_key` and storing the `std::unique_ptr filter_data` with the `Slice filter` in the same entry - Reset `last_filter_data` in the case where `filters` is empty, which should be as by then we would've finish using all the `Slice filter` - Deallocated the payload by going out of scope as soon as we're done with using the `filter_content` associated with the payload - This is an internal interface change at the level of `FilterBlockBuilder::Finish()`, which leads to touching the inherited interface in `BlockBasedFilterBlockBuilder`. But for that, the payload transferring is ignored. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9070 Test Plan: - The main focus is to catch segment fault error during `FilterBlockBuilder::Finish()` and `BlockBasedTableBuilder::Finish()` and interface mismatch. Relying on existing CI tests is enough as `assert(false)` was temporarily added to verify the new logic of transferring ownership indeed run Reviewed By: pdillinger Differential Revision: D31884933 Pulled By: hx235 fbshipit-source-id: f73ecfbea13788d4fc058013ace27230110b52f4
3 years ago
ExtractUserKey(last_filter_entry_key), handle_encoding,
&handle_delta_encoding_slice);
}
} else {
MaybeCutAFilterBlock(nullptr);
}
Detect (new) Bloom/Ribbon Filter construction corruption (#9342) Summary: Note: rebase on and merge after https://github.com/facebook/rocksdb/pull/9349, https://github.com/facebook/rocksdb/pull/9345, (optional) https://github.com/facebook/rocksdb/pull/9393 **Context:** (Quoted from pdillinger) Layers of information during new Bloom/Ribbon Filter construction in building block-based tables includes the following: a) set of keys to add to filter b) set of hashes to add to filter (64-bit hash applied to each key) c) set of Bloom indices to set in filter, with duplicates d) set of Bloom indices to set in filter, deduplicated e) final filter and its checksum This PR aims to detect corruption (e.g, unexpected hardware/software corruption on data structures residing in the memory for a long time) from b) to e) and leave a) as future works for application level. - b)'s corruption is detected by verifying the xor checksum of the hash entries calculated as the entries accumulate before being added to the filter. (i.e, `XXPH3FilterBitsBuilder::MaybeVerifyHashEntriesChecksum()`) - c) - e)'s corruption is detected by verifying the hash entries indeed exists in the constructed filter by re-querying these hash entries in the filter (i.e, `FilterBitsBuilder::MaybePostVerify()`) after computing the block checksum (except for PartitionFilter, which is done right after each `FilterBitsBuilder::Finish` for impl simplicity - see code comment for more). For this stage of detection, we assume hash entries are not corrupted after checking on b) since the time interval from b) to c) is relatively short IMO. Option to enable this feature of detection is `BlockBasedTableOptions::detect_filter_construct_corruption` which is false by default. **Summary:** - Implemented new functions `XXPH3FilterBitsBuilder::MaybeVerifyHashEntriesChecksum()` and `FilterBitsBuilder::MaybePostVerify()` - Ensured hash entries, final filter and banding and their [cache reservation ](https://github.com/facebook/rocksdb/issues/9073) are released properly despite corruption - See [Filter.construction.artifacts.release.point.pdf ](https://github.com/facebook/rocksdb/files/7923487/Design.Filter.construction.artifacts.release.point.pdf) for high-level design - Bundled and refactored hash entries's related artifact in XXPH3FilterBitsBuilder into `HashEntriesInfo` for better control on lifetime of these artifact during `SwapEntires`, `ResetEntries` - Ensured RocksDB block-based table builder calls `FilterBitsBuilder::MaybePostVerify()` after constructing the filter by `FilterBitsBuilder::Finish()` - When encountering such filter construction corruption, stop writing the filter content to files and mark such a block-based table building non-ok by storing the corruption status in the builder. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9342 Test Plan: - Added new unit test `DBFilterConstructionCorruptionTestWithParam.DetectCorruption` - Included this new feature in `DBFilterConstructionReserveMemoryTestWithParam.ReserveMemory` as this feature heavily touch ReserveMemory's impl - For fallback case, I run `./filter_bench -impl=3 -detect_filter_construct_corruption=true -reserve_table_builder_memory=true -strict_capacity_limit=true -quick -runs 10 | grep 'Build avg'` to make sure nothing break. - Added to `filter_bench`: increased filter construction time by **30%**, mostly by `MaybePostVerify()` - FastLocalBloom - Before change: `./filter_bench -impl=2 -quick -runs 10 | grep 'Build avg'`: **28.86643s** - After change: - `./filter_bench -impl=2 -detect_filter_construct_corruption=false -quick -runs 10 | grep 'Build avg'` (expect a tiny increase due to MaybePostVerify is always called regardless): **27.6644s (-4% perf improvement might be due to now we don't drop bloom hash entry in `AddAllEntries` along iteration but in bulk later, same with the bypassing-MaybePostVerify case below)** - `./filter_bench -impl=2 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'` (expect acceptable increase): **34.41159s (+20%)** - `./filter_bench -impl=2 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'` (by-passing MaybePostVerify, expect minor increase): **27.13431s (-6%)** - Standard128Ribbon - Before change: `./filter_bench -impl=3 -quick -runs 10 | grep 'Build avg'`: **122.5384s** - After change: - `./filter_bench -impl=3 -detect_filter_construct_corruption=false -quick -runs 10 | grep 'Build avg'` (expect a tiny increase due to MaybePostVerify is always called regardless - verified by removing MaybePostVerify under this case and found only +-1ns difference): **124.3588s (+2%)** - `./filter_bench -impl=3 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'`(expect acceptable increase): **159.4946s (+30%)** - `./filter_bench -impl=3 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'`(by-passing MaybePostVerify, expect minor increase) : **125.258s (+2%)** - Added to `db_stress`: `make crash_test`, `./db_stress --detect_filter_construct_corruption=true` - Manually smoke-tested: manually corrupted the filter construction in some db level tests with basic PUT and background flush. As expected, the error did get returned to users in subsequent PUT and Flush status. Reviewed By: pdillinger Differential Revision: D33746928 Pulled By: hx235 fbshipit-source-id: cb056426be5a7debc1cd16f23bc250f36a08ca57
3 years ago
if (!partitioned_filters_construction_status_.ok()) {
*status = partitioned_filters_construction_status_;
return Slice();
}
// If there is no filter partition left, then return the index on filter
// partitions
if (UNLIKELY(filters.empty())) {
*status = Status::OK();
Deallocate payload of BlockBasedTableBuilder::Rep::FilterBlockBuilder earlier for Full/PartitionedFilter (#9070) Summary: Note: This PR is the 1st part of a bigger PR stack (https://github.com/facebook/rocksdb/pull/9073). Context: Previously, the payload (i.e, filter data) within `BlockBasedTableBuilder::Rep::FilterBlockBuilder` object is not deallocated until `BlockBasedTableBuilder` is deallocated, despite it is no longer useful after its related `filter_content` being written. - Transferred the payload (i.e, the filter data) out of `BlockBasedTableBuilder::Rep::FilterBlockBuilder` object - For PartitionedFilter: - Unified `filters` and `filter_gc` lists into one `std::deque<FilterEntry> filters` by adding a new field `last_filter_entry_key` and storing the `std::unique_ptr filter_data` with the `Slice filter` in the same entry - Reset `last_filter_data` in the case where `filters` is empty, which should be as by then we would've finish using all the `Slice filter` - Deallocated the payload by going out of scope as soon as we're done with using the `filter_content` associated with the payload - This is an internal interface change at the level of `FilterBlockBuilder::Finish()`, which leads to touching the inherited interface in `BlockBasedFilterBlockBuilder`. But for that, the payload transferring is ignored. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9070 Test Plan: - The main focus is to catch segment fault error during `FilterBlockBuilder::Finish()` and `BlockBasedTableBuilder::Finish()` and interface mismatch. Relying on existing CI tests is enough as `assert(false)` was temporarily added to verify the new logic of transferring ownership indeed run Reviewed By: pdillinger Differential Revision: D31884933 Pulled By: hx235 fbshipit-source-id: f73ecfbea13788d4fc058013ace27230110b52f4
3 years ago
last_filter_data.reset();
if (finishing_filters) {
// Simplest to just add them all at the end
total_added_in_built_ = 0;
if (p_index_builder_->seperator_is_key_plus_seq()) {
return index_on_filter_block_builder_.Finish();
} else {
return index_on_filter_block_builder_without_seq_.Finish();
}
} else {
// This is the rare case where no key was added to the filter
return Slice();
}
} else {
// Return the next filter partition in line and set Incomplete() status to
// indicate we expect more calls to Finish
*status = Status::Incomplete();
finishing_filters = true;
Deallocate payload of BlockBasedTableBuilder::Rep::FilterBlockBuilder earlier for Full/PartitionedFilter (#9070) Summary: Note: This PR is the 1st part of a bigger PR stack (https://github.com/facebook/rocksdb/pull/9073). Context: Previously, the payload (i.e, filter data) within `BlockBasedTableBuilder::Rep::FilterBlockBuilder` object is not deallocated until `BlockBasedTableBuilder` is deallocated, despite it is no longer useful after its related `filter_content` being written. - Transferred the payload (i.e, the filter data) out of `BlockBasedTableBuilder::Rep::FilterBlockBuilder` object - For PartitionedFilter: - Unified `filters` and `filter_gc` lists into one `std::deque<FilterEntry> filters` by adding a new field `last_filter_entry_key` and storing the `std::unique_ptr filter_data` with the `Slice filter` in the same entry - Reset `last_filter_data` in the case where `filters` is empty, which should be as by then we would've finish using all the `Slice filter` - Deallocated the payload by going out of scope as soon as we're done with using the `filter_content` associated with the payload - This is an internal interface change at the level of `FilterBlockBuilder::Finish()`, which leads to touching the inherited interface in `BlockBasedFilterBlockBuilder`. But for that, the payload transferring is ignored. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9070 Test Plan: - The main focus is to catch segment fault error during `FilterBlockBuilder::Finish()` and `BlockBasedTableBuilder::Finish()` and interface mismatch. Relying on existing CI tests is enough as `assert(false)` was temporarily added to verify the new logic of transferring ownership indeed run Reviewed By: pdillinger Differential Revision: D31884933 Pulled By: hx235 fbshipit-source-id: f73ecfbea13788d4fc058013ace27230110b52f4
3 years ago
last_filter_entry_key = filters.front().key;
Slice filter = filters.front().filter;
last_filter_data = std::move(filters.front().filter_data);
if (filter_data != nullptr) {
*filter_data = std::move(last_filter_data);
}
filters.pop_front();
return filter;
}
}
PartitionedFilterBlockReader::PartitionedFilterBlockReader(
const BlockBasedTable* t, CachableEntry<Block>&& filter_block)
: FilterBlockReaderCommon(t, std::move(filter_block)) {}
std::unique_ptr<FilterBlockReader> PartitionedFilterBlockReader::Create(
const BlockBasedTable* table, const ReadOptions& ro,
FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
bool pin, BlockCacheLookupContext* lookup_context) {
assert(table);
assert(table->get_rep());
assert(!pin || prefetch);
CachableEntry<Block> filter_block;
if (prefetch || !use_cache) {
const Status s = ReadFilterBlock(table, prefetch_buffer, ro, use_cache,
nullptr /* get_context */, lookup_context,
&filter_block);
if (!s.ok()) {
IGNORE_STATUS_IF_ERROR(s);
return std::unique_ptr<FilterBlockReader>();
}
if (use_cache && !pin) {
filter_block.Reset();
}
}
return std::unique_ptr<FilterBlockReader>(
new PartitionedFilterBlockReader(table, std::move(filter_block)));
}
bool PartitionedFilterBlockReader::KeyMayMatch(
const Slice& key, const SliceTransform* prefix_extractor,
Create a BlockCacheLookupContext to enable fine-grained block cache tracing. (#5421) Summary: BlockCacheLookupContext only contains the caller for now. We will trace block accesses at five places: 1. BlockBasedTable::GetFilter. 2. BlockBasedTable::GetUncompressedDict. 3. BlockBasedTable::MaybeReadAndLoadToCache. (To trace access on data, index, and range deletion block.) 4. BlockBasedTable::Get. (To trace the referenced key and whether the referenced key exists in a fetched data block.) 5. BlockBasedTable::MultiGet. (To trace the referenced key and whether the referenced key exists in a fetched data block.) We create the context at: 1. BlockBasedTable::Get. (kUserGet) 2. BlockBasedTable::MultiGet. (kUserMGet) 3. BlockBasedTable::NewIterator. (either kUserIterator, kCompaction, or external SST ingestion calls this function.) 4. BlockBasedTable::Open. (kPrefetch) 5. Index/Filter::CacheDependencies. (kPrefetch) 6. BlockBasedTable::ApproximateOffsetOf. (kCompaction or kUserApproximateSize). I loaded 1 million key-value pairs into the database and ran the readrandom benchmark with a single thread. I gave the block cache 10 GB to make sure all reads hit the block cache after warmup. The throughput is comparable. Throughput of this PR: 231334 ops/s. Throughput of the master branch: 238428 ops/s. Experiment setup: RocksDB: version 6.2 Date: Mon Jun 10 10:42:51 2019 CPU: 24 * Intel Core Processor (Skylake) CPUCache: 16384 KB Keys: 20 bytes each Values: 100 bytes each (100 bytes after compression) Entries: 1000000 Prefix: 20 bytes Keys per prefix: 0 RawSize: 114.4 MB (estimated) FileSize: 114.4 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: NoCompression Compression sampling rate: 0 Memtablerep: skip_list Perf Level: 1 Load command: ./db_bench --benchmarks="fillseq" --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --statistics --cache_index_and_filter_blocks --cache_size=10737418240 --disable_auto_compactions=1 --disable_wal=1 --compression_type=none --min_level_to_compress=-1 --compression_ratio=1 --num=1000000 Run command: ./db_bench --benchmarks="readrandom,stats" --use_existing_db --threads=1 --duration=120 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --statistics --cache_index_and_filter_blocks --cache_size=10737418240 --disable_auto_compactions=1 --disable_wal=1 --compression_type=none --min_level_to_compress=-1 --compression_ratio=1 --num=1000000 --duration=120 TODOs: 1. Create a caller for external SST file ingestion and differentiate the callers for iterator. 2. Integrate tracer to trace block cache accesses. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5421 Differential Revision: D15704258 Pulled By: HaoyuHuang fbshipit-source-id: 4aa8a55f8cb1576ffb367bfa3186a91d8f06d93a
5 years ago
uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr,
GetContext* get_context, BlockCacheLookupContext* lookup_context) {
assert(const_ikey_ptr != nullptr);
assert(block_offset == kNotValid);
if (!whole_key_filtering()) {
return true;
}
return MayMatch(key, prefix_extractor, block_offset, no_io, const_ikey_ptr,
get_context, lookup_context,
&FullFilterBlockReader::KeyMayMatch);
}
Basic MultiGet support for partitioned filters (#6757) Summary: In MultiGet, access each applicable filter partition only once per batch, rather than for each applicable key. Also, * Fix Bloom stats for MultiGet * Fix/refactor MultiGetContext::Range::KeysLeft, including * Add efficient BitsSetToOne implementation * Assert that MultiGetContext::Range does not go beyond shift range Performance test: Generate db: $ ./db_bench --benchmarks=fillrandom --num=15000000 --cache_index_and_filter_blocks -bloom_bits=10 -partition_index_and_filters=true ... Before (middle performing run of three; note some missing Bloom stats): $ ./db_bench --use-existing-db --benchmarks=multireadrandom --num=15000000 --cache_index_and_filter_blocks --bloom_bits=10 --threads=16 --cache_size=20000000 -partition_index_and_filters -batch_size=32 -multiread_batched -statistics --duration=20 2>&1 | egrep 'micros/op|block.cache.filter.hit|bloom.filter.(full|use)|number.multiget' multireadrandom : 26.403 micros/op 597517 ops/sec; (548427 of 671968 found) rocksdb.block.cache.filter.hit COUNT : 83443275 rocksdb.bloom.filter.useful COUNT : 0 rocksdb.bloom.filter.full.positive COUNT : 0 rocksdb.bloom.filter.full.true.positive COUNT : 7931450 rocksdb.number.multiget.get COUNT : 385984 rocksdb.number.multiget.keys.read COUNT : 12351488 rocksdb.number.multiget.bytes.read COUNT : 793145000 rocksdb.number.multiget.keys.found COUNT : 7931450 After (middle performing run of three): $ ./db_bench_new --use-existing-db --benchmarks=multireadrandom --num=15000000 --cache_index_and_filter_blocks --bloom_bits=10 --threads=16 --cache_size=20000000 -partition_index_and_filters -batch_size=32 -multiread_batched -statistics --duration=20 2>&1 | egrep 'micros/op|block.cache.filter.hit|bloom.filter.(full|use)|number.multiget' multireadrandom : 21.024 micros/op 752963 ops/sec; (705188 of 863968 found) rocksdb.block.cache.filter.hit COUNT : 49856682 rocksdb.bloom.filter.useful COUNT : 45684579 rocksdb.bloom.filter.full.positive COUNT : 10395458 rocksdb.bloom.filter.full.true.positive COUNT : 9908456 rocksdb.number.multiget.get COUNT : 481984 rocksdb.number.multiget.keys.read COUNT : 15423488 rocksdb.number.multiget.bytes.read COUNT : 990845600 rocksdb.number.multiget.keys.found COUNT : 9908456 So that's about 25% higher throughput even for random keys Pull Request resolved: https://github.com/facebook/rocksdb/pull/6757 Test Plan: unit test included Reviewed By: anand1976 Differential Revision: D21243256 Pulled By: pdillinger fbshipit-source-id: 5644a1468d9e8c8575be02f4e04bc5d62dbbb57f
4 years ago
void PartitionedFilterBlockReader::KeysMayMatch(
MultiGetRange* range, const SliceTransform* prefix_extractor,
uint64_t block_offset, const bool no_io,
BlockCacheLookupContext* lookup_context) {
assert(block_offset == kNotValid);
if (!whole_key_filtering()) {
return; // Any/all may match
}
MayMatch(range, prefix_extractor, block_offset, no_io, lookup_context,
&FullFilterBlockReader::KeysMayMatch);
}
bool PartitionedFilterBlockReader::PrefixMayMatch(
const Slice& prefix, const SliceTransform* prefix_extractor,
Create a BlockCacheLookupContext to enable fine-grained block cache tracing. (#5421) Summary: BlockCacheLookupContext only contains the caller for now. We will trace block accesses at five places: 1. BlockBasedTable::GetFilter. 2. BlockBasedTable::GetUncompressedDict. 3. BlockBasedTable::MaybeReadAndLoadToCache. (To trace access on data, index, and range deletion block.) 4. BlockBasedTable::Get. (To trace the referenced key and whether the referenced key exists in a fetched data block.) 5. BlockBasedTable::MultiGet. (To trace the referenced key and whether the referenced key exists in a fetched data block.) We create the context at: 1. BlockBasedTable::Get. (kUserGet) 2. BlockBasedTable::MultiGet. (kUserMGet) 3. BlockBasedTable::NewIterator. (either kUserIterator, kCompaction, or external SST ingestion calls this function.) 4. BlockBasedTable::Open. (kPrefetch) 5. Index/Filter::CacheDependencies. (kPrefetch) 6. BlockBasedTable::ApproximateOffsetOf. (kCompaction or kUserApproximateSize). I loaded 1 million key-value pairs into the database and ran the readrandom benchmark with a single thread. I gave the block cache 10 GB to make sure all reads hit the block cache after warmup. The throughput is comparable. Throughput of this PR: 231334 ops/s. Throughput of the master branch: 238428 ops/s. Experiment setup: RocksDB: version 6.2 Date: Mon Jun 10 10:42:51 2019 CPU: 24 * Intel Core Processor (Skylake) CPUCache: 16384 KB Keys: 20 bytes each Values: 100 bytes each (100 bytes after compression) Entries: 1000000 Prefix: 20 bytes Keys per prefix: 0 RawSize: 114.4 MB (estimated) FileSize: 114.4 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: NoCompression Compression sampling rate: 0 Memtablerep: skip_list Perf Level: 1 Load command: ./db_bench --benchmarks="fillseq" --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --statistics --cache_index_and_filter_blocks --cache_size=10737418240 --disable_auto_compactions=1 --disable_wal=1 --compression_type=none --min_level_to_compress=-1 --compression_ratio=1 --num=1000000 Run command: ./db_bench --benchmarks="readrandom,stats" --use_existing_db --threads=1 --duration=120 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --statistics --cache_index_and_filter_blocks --cache_size=10737418240 --disable_auto_compactions=1 --disable_wal=1 --compression_type=none --min_level_to_compress=-1 --compression_ratio=1 --num=1000000 --duration=120 TODOs: 1. Create a caller for external SST file ingestion and differentiate the callers for iterator. 2. Integrate tracer to trace block cache accesses. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5421 Differential Revision: D15704258 Pulled By: HaoyuHuang fbshipit-source-id: 4aa8a55f8cb1576ffb367bfa3186a91d8f06d93a
5 years ago
uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr,
GetContext* get_context, BlockCacheLookupContext* lookup_context) {
assert(const_ikey_ptr != nullptr);
assert(block_offset == kNotValid);
if (!table_prefix_extractor() && !prefix_extractor) {
return true;
}
return MayMatch(prefix, prefix_extractor, block_offset, no_io, const_ikey_ptr,
get_context, lookup_context,
&FullFilterBlockReader::PrefixMayMatch);
}
Basic MultiGet support for partitioned filters (#6757) Summary: In MultiGet, access each applicable filter partition only once per batch, rather than for each applicable key. Also, * Fix Bloom stats for MultiGet * Fix/refactor MultiGetContext::Range::KeysLeft, including * Add efficient BitsSetToOne implementation * Assert that MultiGetContext::Range does not go beyond shift range Performance test: Generate db: $ ./db_bench --benchmarks=fillrandom --num=15000000 --cache_index_and_filter_blocks -bloom_bits=10 -partition_index_and_filters=true ... Before (middle performing run of three; note some missing Bloom stats): $ ./db_bench --use-existing-db --benchmarks=multireadrandom --num=15000000 --cache_index_and_filter_blocks --bloom_bits=10 --threads=16 --cache_size=20000000 -partition_index_and_filters -batch_size=32 -multiread_batched -statistics --duration=20 2>&1 | egrep 'micros/op|block.cache.filter.hit|bloom.filter.(full|use)|number.multiget' multireadrandom : 26.403 micros/op 597517 ops/sec; (548427 of 671968 found) rocksdb.block.cache.filter.hit COUNT : 83443275 rocksdb.bloom.filter.useful COUNT : 0 rocksdb.bloom.filter.full.positive COUNT : 0 rocksdb.bloom.filter.full.true.positive COUNT : 7931450 rocksdb.number.multiget.get COUNT : 385984 rocksdb.number.multiget.keys.read COUNT : 12351488 rocksdb.number.multiget.bytes.read COUNT : 793145000 rocksdb.number.multiget.keys.found COUNT : 7931450 After (middle performing run of three): $ ./db_bench_new --use-existing-db --benchmarks=multireadrandom --num=15000000 --cache_index_and_filter_blocks --bloom_bits=10 --threads=16 --cache_size=20000000 -partition_index_and_filters -batch_size=32 -multiread_batched -statistics --duration=20 2>&1 | egrep 'micros/op|block.cache.filter.hit|bloom.filter.(full|use)|number.multiget' multireadrandom : 21.024 micros/op 752963 ops/sec; (705188 of 863968 found) rocksdb.block.cache.filter.hit COUNT : 49856682 rocksdb.bloom.filter.useful COUNT : 45684579 rocksdb.bloom.filter.full.positive COUNT : 10395458 rocksdb.bloom.filter.full.true.positive COUNT : 9908456 rocksdb.number.multiget.get COUNT : 481984 rocksdb.number.multiget.keys.read COUNT : 15423488 rocksdb.number.multiget.bytes.read COUNT : 990845600 rocksdb.number.multiget.keys.found COUNT : 9908456 So that's about 25% higher throughput even for random keys Pull Request resolved: https://github.com/facebook/rocksdb/pull/6757 Test Plan: unit test included Reviewed By: anand1976 Differential Revision: D21243256 Pulled By: pdillinger fbshipit-source-id: 5644a1468d9e8c8575be02f4e04bc5d62dbbb57f
4 years ago
void PartitionedFilterBlockReader::PrefixesMayMatch(
MultiGetRange* range, const SliceTransform* prefix_extractor,
uint64_t block_offset, const bool no_io,
BlockCacheLookupContext* lookup_context) {
assert(block_offset == kNotValid);
if (!table_prefix_extractor() && !prefix_extractor) {
return; // Any/all may match
}
MayMatch(range, prefix_extractor, block_offset, no_io, lookup_context,
&FullFilterBlockReader::PrefixesMayMatch);
}
BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle(
const CachableEntry<Block>& filter_block, const Slice& entry) const {
IndexBlockIter iter;
const InternalKeyComparator* const comparator = internal_comparator();
Statistics* kNullStats = nullptr;
filter_block.GetValue()->NewIndexIterator(
Separate internal and user key comparators in `BlockIter` (#6944) Summary: Replace `BlockIter::comparator_` and `IndexBlockIter::user_comparator_wrapper_` with a concrete `UserComparatorWrapper` and `InternalKeyComparator`. The motivation for this change was the inconvenience of not knowing the concrete type of `BlockIter::comparator_`, which prevented calling specialized internal key comparison functions to optimize comparison of keys with global seqno applied. Pull Request resolved: https://github.com/facebook/rocksdb/pull/6944 Test Plan: benchmark setup -- single file DBs, in-memory, no compression. "normal_db" created by regular flush; "ingestion_db" created by ingesting a file. Both DBs have same contents. ``` $ TEST_TMPDIR=/dev/shm/normal_db/ ./db_bench -benchmarks=fillrandom,compact -write_buffer_size=10485760000 -disable_auto_compactions=true -compression_type=none -num=1000000 $ ./ldb write_extern_sst ./tmp.sst --db=/dev/shm/ingestion_db/dbbench/ --compression_type=no --hex --create_if_missing < <(./sst_dump --command=scan --output_hex --file=/dev/shm/normal_db/dbbench/000007.sst | awk 'began {print "0x" substr($1, 2, length($1) - 2), "==>", "0x" $5} ; /^Sst file format: block-based/ {began=1}') $ ./ldb ingest_extern_sst ./tmp.sst --db=/dev/shm/ingestion_db/dbbench/ ``` benchmark run command: ``` $ TEST_TMPDIR=/dev/shm/$DB/ ./db_bench -benchmarks=seekrandom -seek_nexts=$SEEK_NEXT -use_existing_db=true -cache_index_and_filter_blocks=false -num=1000000 -cache_size=0 -threads=1 -reads=200000000 -mmap_read=1 -verify_checksum=false ``` results: perf improved marginally for ingestion_db and did not change significantly for normal_db: SEEK_NEXT | DB | code | ops/sec | % change -- | -- | -- | -- | -- 0 | normal_db | master | 350880 |   0 | normal_db | PR6944 | 351040 | 0.0 0 | ingestion_db | master | 343255 |   0 | ingestion_db | PR6944 | 349424 | 1.8 10 | normal_db | master | 218711 |   10 | normal_db | PR6944 | 217892 | -0.4 10 | ingestion_db | master | 220334 |   10 | ingestion_db | PR6944 | 226437 | 2.8 Reviewed By: pdillinger Differential Revision: D21924676 Pulled By: ajkr fbshipit-source-id: ea4288a2eefa8112eb6c651a671c1de18c12e538
4 years ago
comparator->user_comparator(),
table()->get_rep()->get_global_seqno(BlockType::kFilter), &iter,
kNullStats, true /* total_order_seek */, false /* have_first_key */,
index_key_includes_seq(), index_value_is_full());
iter.Seek(entry);
if (UNLIKELY(!iter.Valid())) {
// entry is larger than all the keys. However its prefix might still be
// present in the last partition. If this is called by PrefixMayMatch this
// is necessary for correct behavior. Otherwise it is unnecessary but safe.
// Assuming this is an unlikely case for full key search, the performance
// overhead should be negligible.
iter.SeekToLast();
}
assert(iter.Valid());
Add an option to put first key of each sst block in the index (#5289) Summary: The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes. Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it. So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks. Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files. This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289 Differential Revision: D15256423 Pulled By: al13n321 fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
BlockHandle fltr_blk_handle = iter.value().handle;
return fltr_blk_handle;
}
Status PartitionedFilterBlockReader::GetFilterPartitionBlock(
FilePrefetchBuffer* prefetch_buffer, const BlockHandle& fltr_blk_handle,
bool no_io, GetContext* get_context,
BlockCacheLookupContext* lookup_context,
Store the filter bits reader alongside the filter block contents (#5936) Summary: Amongst other things, PR https://github.com/facebook/rocksdb/issues/5504 refactored the filter block readers so that only the filter block contents are stored in the block cache (as opposed to the earlier design where the cache stored the filter block reader itself, leading to potentially dangling pointers and concurrency bugs). However, this change introduced a performance hit since with the new code, the metadata fields are re-parsed upon every access. This patch reunites the block contents with the filter bits reader to eliminate this overhead; since this is still a self-contained pure data object, it is safe to store it in the cache. (Note: this is similar to how the zstd digest is handled.) Pull Request resolved: https://github.com/facebook/rocksdb/pull/5936 Test Plan: make asan_check filter_bench results for the old code: ``` $ ./filter_bench -quick WARNING: Assertions are enabled; benchmarks unnecessarily slow Building... Build avg ns/key: 26.7153 Number of filters: 16669 Total memory (MB): 200.009 Bits/key actual: 10.0647 ---------------------------- Inside queries... Dry run (46b) ns/op: 33.4258 Single filter ns/op: 42.5974 Random filter ns/op: 217.861 ---------------------------- Outside queries... Dry run (25d) ns/op: 32.4217 Single filter ns/op: 50.9855 Random filter ns/op: 219.167 Average FP rate %: 1.13993 ---------------------------- Done. (For more info, run with -legend or -help.) $ ./filter_bench -quick -use_full_block_reader WARNING: Assertions are enabled; benchmarks unnecessarily slow Building... Build avg ns/key: 26.5172 Number of filters: 16669 Total memory (MB): 200.009 Bits/key actual: 10.0647 ---------------------------- Inside queries... Dry run (46b) ns/op: 32.3556 Single filter ns/op: 83.2239 Random filter ns/op: 370.676 ---------------------------- Outside queries... Dry run (25d) ns/op: 32.2265 Single filter ns/op: 93.5651 Random filter ns/op: 408.393 Average FP rate %: 1.13993 ---------------------------- Done. (For more info, run with -legend or -help.) ``` With the new code: ``` $ ./filter_bench -quick WARNING: Assertions are enabled; benchmarks unnecessarily slow Building... Build avg ns/key: 25.4285 Number of filters: 16669 Total memory (MB): 200.009 Bits/key actual: 10.0647 ---------------------------- Inside queries... Dry run (46b) ns/op: 31.0594 Single filter ns/op: 43.8974 Random filter ns/op: 226.075 ---------------------------- Outside queries... Dry run (25d) ns/op: 31.0295 Single filter ns/op: 50.3824 Random filter ns/op: 226.805 Average FP rate %: 1.13993 ---------------------------- Done. (For more info, run with -legend or -help.) $ ./filter_bench -quick -use_full_block_reader WARNING: Assertions are enabled; benchmarks unnecessarily slow Building... Build avg ns/key: 26.5308 Number of filters: 16669 Total memory (MB): 200.009 Bits/key actual: 10.0647 ---------------------------- Inside queries... Dry run (46b) ns/op: 33.2968 Single filter ns/op: 58.6163 Random filter ns/op: 291.434 ---------------------------- Outside queries... Dry run (25d) ns/op: 32.1839 Single filter ns/op: 66.9039 Random filter ns/op: 292.828 Average FP rate %: 1.13993 ---------------------------- Done. (For more info, run with -legend or -help.) ``` Differential Revision: D17991712 Pulled By: ltamasi fbshipit-source-id: 7ea205550217bfaaa1d5158ebd658e5832e60f29
5 years ago
CachableEntry<ParsedFullFilterBlock>* filter_block) const {
assert(table());
assert(filter_block);
assert(filter_block->IsEmpty());
if (!filter_map_.empty()) {
auto iter = filter_map_.find(fltr_blk_handle.offset());
// This is a possible scenario since block cache might not have had space
// for the partition
if (iter != filter_map_.end()) {
filter_block->SetUnownedValue(iter->second.GetValue());
return Status::OK();
}
}
ReadOptions read_options;
if (no_io) {
read_options.read_tier = kBlockCacheTier;
}
const Status s =
table()->RetrieveBlock(prefetch_buffer, read_options, fltr_blk_handle,
UncompressionDict::GetEmptyDict(), filter_block,
Fix regression affecting partitioned indexes/filters when cache_index_and_filter_blocks is false (#5705) Summary: PR https://github.com/facebook/rocksdb/issues/5298 (and subsequent related patches) unintentionally changed the semantics of cache_index_and_filter_blocks: historically, this option only affected the main index/filter block; with the changes, it affects index/filter partitions as well. This can cause performance issues when cache_index_and_filter_blocks is false since in this case, partitions are neither cached nor preloaded (i.e. they are loaded on demand upon each access). The patch reverts to the earlier behavior, that is, partitions are cached similarly to data blocks regardless of the value of the above option. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5705 Test Plan: make check ./db_bench -benchmarks=fillrandom --statistics --stats_interval_seconds=1 --duration=30 --num=500000000 --bloom_bits=20 --partition_index_and_filters=true --cache_index_and_filter_blocks=false ./db_bench -benchmarks=readrandom --use_existing_db --statistics --stats_interval_seconds=1 --duration=10 --num=500000000 --bloom_bits=20 --partition_index_and_filters=true --cache_index_and_filter_blocks=false --cache_size=8000000000 Relevant statistics from the readrandom benchmark with the old code: rocksdb.block.cache.index.miss COUNT : 0 rocksdb.block.cache.index.hit COUNT : 0 rocksdb.block.cache.index.add COUNT : 0 rocksdb.block.cache.index.bytes.insert COUNT : 0 rocksdb.block.cache.index.bytes.evict COUNT : 0 rocksdb.block.cache.filter.miss COUNT : 0 rocksdb.block.cache.filter.hit COUNT : 0 rocksdb.block.cache.filter.add COUNT : 0 rocksdb.block.cache.filter.bytes.insert COUNT : 0 rocksdb.block.cache.filter.bytes.evict COUNT : 0 With the new code: rocksdb.block.cache.index.miss COUNT : 2500 rocksdb.block.cache.index.hit COUNT : 42696 rocksdb.block.cache.index.add COUNT : 2500 rocksdb.block.cache.index.bytes.insert COUNT : 4050048 rocksdb.block.cache.index.bytes.evict COUNT : 0 rocksdb.block.cache.filter.miss COUNT : 2500 rocksdb.block.cache.filter.hit COUNT : 4550493 rocksdb.block.cache.filter.add COUNT : 2500 rocksdb.block.cache.filter.bytes.insert COUNT : 10331040 rocksdb.block.cache.filter.bytes.evict COUNT : 0 Differential Revision: D16817382 Pulled By: ltamasi fbshipit-source-id: 28a516b0da1f041a03313e0b70b28cf5cf205d00
5 years ago
BlockType::kFilter, get_context, lookup_context,
Parallelize secondary cache lookup in MultiGet (#8405) Summary: Implement the ```WaitAll()``` interface in ```LRUCache``` to allow callers to issue multiple lookups in parallel and wait for all of them to complete. Modify ```MultiGet``` to use this to parallelize the secondary cache lookups in order to reduce the overall latency. A call to ```cache->Lookup()``` returns a handle that has an incomplete value (nullptr), and the caller can call ```cache->IsReady()``` to check whether the lookup is complete, and pass a vector of handles to ```WaitAll``` to wait for completion. If any of the lookups fail, ```MultiGet``` will read the block from the SST file. Another change in this PR is to rename ```SecondaryCacheHandle``` to ```SecondaryCacheResultHandle``` as it more accurately describes the return result of the secondary cache lookup, which is more like a future. Tests: 1. Add unit tests in lru_cache_test 2. Benchmark results with no secondary cache configured Master - ``` readrandom : 41.175 micros/op 388562 ops/sec; 106.7 MB/s (7277999 of 7277999 found) readrandom : 41.217 micros/op 388160 ops/sec; 106.6 MB/s (7274999 of 7274999 found) multireadrandom : 10.309 micros/op 1552082 ops/sec; (28908992 of 28908992 found) multireadrandom : 10.321 micros/op 1550218 ops/sec; (29081984 of 29081984 found) ``` This PR - ``` readrandom : 41.158 micros/op 388723 ops/sec; 106.8 MB/s (7290999 of 7290999 found) readrandom : 41.185 micros/op 388463 ops/sec; 106.7 MB/s (7287999 of 7287999 found) multireadrandom : 10.277 micros/op 1556801 ops/sec; (29346944 of 29346944 found) multireadrandom : 10.253 micros/op 1560539 ops/sec; (29274944 of 29274944 found) ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/8405 Reviewed By: zhichao-cao Differential Revision: D29190509 Pulled By: anand1976 fbshipit-source-id: 6f8eff6246712af8a297cfe22ea0d1c3b2a01bb0
3 years ago
/* for_compaction */ false, /* use_cache */ true,
/* wait_for_cache */ true);
return s;
}
bool PartitionedFilterBlockReader::MayMatch(
const Slice& slice, const SliceTransform* prefix_extractor,
uint64_t block_offset, bool no_io, const Slice* const_ikey_ptr,
GetContext* get_context, BlockCacheLookupContext* lookup_context,
FilterFunction filter_function) const {
CachableEntry<Block> filter_block;
Status s =
GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block);
if (UNLIKELY(!s.ok())) {
IGNORE_STATUS_IF_ERROR(s);
return true;
}
if (UNLIKELY(filter_block.GetValue()->size() == 0)) {
return true;
}
auto filter_handle = GetFilterPartitionHandle(filter_block, *const_ikey_ptr);
if (UNLIKELY(filter_handle.size() == 0)) { // key is out of range
return false;
}
Store the filter bits reader alongside the filter block contents (#5936) Summary: Amongst other things, PR https://github.com/facebook/rocksdb/issues/5504 refactored the filter block readers so that only the filter block contents are stored in the block cache (as opposed to the earlier design where the cache stored the filter block reader itself, leading to potentially dangling pointers and concurrency bugs). However, this change introduced a performance hit since with the new code, the metadata fields are re-parsed upon every access. This patch reunites the block contents with the filter bits reader to eliminate this overhead; since this is still a self-contained pure data object, it is safe to store it in the cache. (Note: this is similar to how the zstd digest is handled.) Pull Request resolved: https://github.com/facebook/rocksdb/pull/5936 Test Plan: make asan_check filter_bench results for the old code: ``` $ ./filter_bench -quick WARNING: Assertions are enabled; benchmarks unnecessarily slow Building... Build avg ns/key: 26.7153 Number of filters: 16669 Total memory (MB): 200.009 Bits/key actual: 10.0647 ---------------------------- Inside queries... Dry run (46b) ns/op: 33.4258 Single filter ns/op: 42.5974 Random filter ns/op: 217.861 ---------------------------- Outside queries... Dry run (25d) ns/op: 32.4217 Single filter ns/op: 50.9855 Random filter ns/op: 219.167 Average FP rate %: 1.13993 ---------------------------- Done. (For more info, run with -legend or -help.) $ ./filter_bench -quick -use_full_block_reader WARNING: Assertions are enabled; benchmarks unnecessarily slow Building... Build avg ns/key: 26.5172 Number of filters: 16669 Total memory (MB): 200.009 Bits/key actual: 10.0647 ---------------------------- Inside queries... Dry run (46b) ns/op: 32.3556 Single filter ns/op: 83.2239 Random filter ns/op: 370.676 ---------------------------- Outside queries... Dry run (25d) ns/op: 32.2265 Single filter ns/op: 93.5651 Random filter ns/op: 408.393 Average FP rate %: 1.13993 ---------------------------- Done. (For more info, run with -legend or -help.) ``` With the new code: ``` $ ./filter_bench -quick WARNING: Assertions are enabled; benchmarks unnecessarily slow Building... Build avg ns/key: 25.4285 Number of filters: 16669 Total memory (MB): 200.009 Bits/key actual: 10.0647 ---------------------------- Inside queries... Dry run (46b) ns/op: 31.0594 Single filter ns/op: 43.8974 Random filter ns/op: 226.075 ---------------------------- Outside queries... Dry run (25d) ns/op: 31.0295 Single filter ns/op: 50.3824 Random filter ns/op: 226.805 Average FP rate %: 1.13993 ---------------------------- Done. (For more info, run with -legend or -help.) $ ./filter_bench -quick -use_full_block_reader WARNING: Assertions are enabled; benchmarks unnecessarily slow Building... Build avg ns/key: 26.5308 Number of filters: 16669 Total memory (MB): 200.009 Bits/key actual: 10.0647 ---------------------------- Inside queries... Dry run (46b) ns/op: 33.2968 Single filter ns/op: 58.6163 Random filter ns/op: 291.434 ---------------------------- Outside queries... Dry run (25d) ns/op: 32.1839 Single filter ns/op: 66.9039 Random filter ns/op: 292.828 Average FP rate %: 1.13993 ---------------------------- Done. (For more info, run with -legend or -help.) ``` Differential Revision: D17991712 Pulled By: ltamasi fbshipit-source-id: 7ea205550217bfaaa1d5158ebd658e5832e60f29
5 years ago
CachableEntry<ParsedFullFilterBlock> filter_partition_block;
s = GetFilterPartitionBlock(nullptr /* prefetch_buffer */, filter_handle,
no_io, get_context, lookup_context,
&filter_partition_block);
if (UNLIKELY(!s.ok())) {
IGNORE_STATUS_IF_ERROR(s);
return true;
}
FullFilterBlockReader filter_partition(table(),
std::move(filter_partition_block));
return (filter_partition.*filter_function)(
slice, prefix_extractor, block_offset, no_io, const_ikey_ptr, get_context,
lookup_context);
}
Basic MultiGet support for partitioned filters (#6757) Summary: In MultiGet, access each applicable filter partition only once per batch, rather than for each applicable key. Also, * Fix Bloom stats for MultiGet * Fix/refactor MultiGetContext::Range::KeysLeft, including * Add efficient BitsSetToOne implementation * Assert that MultiGetContext::Range does not go beyond shift range Performance test: Generate db: $ ./db_bench --benchmarks=fillrandom --num=15000000 --cache_index_and_filter_blocks -bloom_bits=10 -partition_index_and_filters=true ... Before (middle performing run of three; note some missing Bloom stats): $ ./db_bench --use-existing-db --benchmarks=multireadrandom --num=15000000 --cache_index_and_filter_blocks --bloom_bits=10 --threads=16 --cache_size=20000000 -partition_index_and_filters -batch_size=32 -multiread_batched -statistics --duration=20 2>&1 | egrep 'micros/op|block.cache.filter.hit|bloom.filter.(full|use)|number.multiget' multireadrandom : 26.403 micros/op 597517 ops/sec; (548427 of 671968 found) rocksdb.block.cache.filter.hit COUNT : 83443275 rocksdb.bloom.filter.useful COUNT : 0 rocksdb.bloom.filter.full.positive COUNT : 0 rocksdb.bloom.filter.full.true.positive COUNT : 7931450 rocksdb.number.multiget.get COUNT : 385984 rocksdb.number.multiget.keys.read COUNT : 12351488 rocksdb.number.multiget.bytes.read COUNT : 793145000 rocksdb.number.multiget.keys.found COUNT : 7931450 After (middle performing run of three): $ ./db_bench_new --use-existing-db --benchmarks=multireadrandom --num=15000000 --cache_index_and_filter_blocks --bloom_bits=10 --threads=16 --cache_size=20000000 -partition_index_and_filters -batch_size=32 -multiread_batched -statistics --duration=20 2>&1 | egrep 'micros/op|block.cache.filter.hit|bloom.filter.(full|use)|number.multiget' multireadrandom : 21.024 micros/op 752963 ops/sec; (705188 of 863968 found) rocksdb.block.cache.filter.hit COUNT : 49856682 rocksdb.bloom.filter.useful COUNT : 45684579 rocksdb.bloom.filter.full.positive COUNT : 10395458 rocksdb.bloom.filter.full.true.positive COUNT : 9908456 rocksdb.number.multiget.get COUNT : 481984 rocksdb.number.multiget.keys.read COUNT : 15423488 rocksdb.number.multiget.bytes.read COUNT : 990845600 rocksdb.number.multiget.keys.found COUNT : 9908456 So that's about 25% higher throughput even for random keys Pull Request resolved: https://github.com/facebook/rocksdb/pull/6757 Test Plan: unit test included Reviewed By: anand1976 Differential Revision: D21243256 Pulled By: pdillinger fbshipit-source-id: 5644a1468d9e8c8575be02f4e04bc5d62dbbb57f
4 years ago
void PartitionedFilterBlockReader::MayMatch(
MultiGetRange* range, const SliceTransform* prefix_extractor,
uint64_t block_offset, bool no_io, BlockCacheLookupContext* lookup_context,
FilterManyFunction filter_function) const {
CachableEntry<Block> filter_block;
Status s = GetOrReadFilterBlock(no_io, range->begin()->get_context,
lookup_context, &filter_block);
if (UNLIKELY(!s.ok())) {
IGNORE_STATUS_IF_ERROR(s);
return; // Any/all may match
}
if (UNLIKELY(filter_block.GetValue()->size() == 0)) {
return; // Any/all may match
}
auto start_iter_same_handle = range->begin();
BlockHandle prev_filter_handle = BlockHandle::NullBlockHandle();
// For all keys mapping to same partition (must be adjacent in sorted order)
// share block cache lookup and use full filter multiget on the partition
// filter.
for (auto iter = start_iter_same_handle; iter != range->end(); ++iter) {
// TODO: re-use one top-level index iterator
BlockHandle this_filter_handle =
GetFilterPartitionHandle(filter_block, iter->ikey);
if (!prev_filter_handle.IsNull() &&
this_filter_handle != prev_filter_handle) {
MultiGetRange subrange(*range, start_iter_same_handle, iter);
MayMatchPartition(&subrange, prefix_extractor, block_offset,
prev_filter_handle, no_io, lookup_context,
filter_function);
range->AddSkipsFrom(subrange);
start_iter_same_handle = iter;
}
if (UNLIKELY(this_filter_handle.size() == 0)) { // key is out of range
// Not reachable with current behavior of GetFilterPartitionHandle
assert(false);
range->SkipKey(iter);
prev_filter_handle = BlockHandle::NullBlockHandle();
} else {
prev_filter_handle = this_filter_handle;
}
}
if (!prev_filter_handle.IsNull()) {
MultiGetRange subrange(*range, start_iter_same_handle, range->end());
MayMatchPartition(&subrange, prefix_extractor, block_offset,
prev_filter_handle, no_io, lookup_context,
filter_function);
range->AddSkipsFrom(subrange);
}
}
void PartitionedFilterBlockReader::MayMatchPartition(
MultiGetRange* range, const SliceTransform* prefix_extractor,
uint64_t block_offset, BlockHandle filter_handle, bool no_io,
BlockCacheLookupContext* lookup_context,
FilterManyFunction filter_function) const {
CachableEntry<ParsedFullFilterBlock> filter_partition_block;
Status s = GetFilterPartitionBlock(
nullptr /* prefetch_buffer */, filter_handle, no_io,
range->begin()->get_context, lookup_context, &filter_partition_block);
if (UNLIKELY(!s.ok())) {
IGNORE_STATUS_IF_ERROR(s);
return; // Any/all may match
}
FullFilterBlockReader filter_partition(table(),
std::move(filter_partition_block));
(filter_partition.*filter_function)(range, prefix_extractor, block_offset,
no_io, lookup_context);
}
size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const {
size_t usage = ApproximateFilterBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
usage += malloc_usable_size(const_cast<PartitionedFilterBlockReader*>(this));
#else
usage += sizeof(*this);
#endif // ROCKSDB_MALLOC_USABLE_SIZE
return usage;
// TODO(myabandeh): better estimation for filter_map_ size
}
// TODO(myabandeh): merge this with the same function in IndexReader
Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro,
bool pin) {
assert(table());
const BlockBasedTable::Rep* const rep = table()->get_rep();
assert(rep);
BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
CachableEntry<Block> filter_block;
Status s = GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */,
&lookup_context, &filter_block);
if (!s.ok()) {
ROCKS_LOG_ERROR(rep->ioptions.logger,
"Error retrieving top-level filter block while trying to "
"cache filter partitions: %s",
s.ToString().c_str());
return s;
}
// Before read partitions, prefetch them to avoid lots of IOs
assert(filter_block.GetValue());
IndexBlockIter biter;
const InternalKeyComparator* const comparator = internal_comparator();
Statistics* kNullStats = nullptr;
filter_block.GetValue()->NewIndexIterator(
Separate internal and user key comparators in `BlockIter` (#6944) Summary: Replace `BlockIter::comparator_` and `IndexBlockIter::user_comparator_wrapper_` with a concrete `UserComparatorWrapper` and `InternalKeyComparator`. The motivation for this change was the inconvenience of not knowing the concrete type of `BlockIter::comparator_`, which prevented calling specialized internal key comparison functions to optimize comparison of keys with global seqno applied. Pull Request resolved: https://github.com/facebook/rocksdb/pull/6944 Test Plan: benchmark setup -- single file DBs, in-memory, no compression. "normal_db" created by regular flush; "ingestion_db" created by ingesting a file. Both DBs have same contents. ``` $ TEST_TMPDIR=/dev/shm/normal_db/ ./db_bench -benchmarks=fillrandom,compact -write_buffer_size=10485760000 -disable_auto_compactions=true -compression_type=none -num=1000000 $ ./ldb write_extern_sst ./tmp.sst --db=/dev/shm/ingestion_db/dbbench/ --compression_type=no --hex --create_if_missing < <(./sst_dump --command=scan --output_hex --file=/dev/shm/normal_db/dbbench/000007.sst | awk 'began {print "0x" substr($1, 2, length($1) - 2), "==>", "0x" $5} ; /^Sst file format: block-based/ {began=1}') $ ./ldb ingest_extern_sst ./tmp.sst --db=/dev/shm/ingestion_db/dbbench/ ``` benchmark run command: ``` $ TEST_TMPDIR=/dev/shm/$DB/ ./db_bench -benchmarks=seekrandom -seek_nexts=$SEEK_NEXT -use_existing_db=true -cache_index_and_filter_blocks=false -num=1000000 -cache_size=0 -threads=1 -reads=200000000 -mmap_read=1 -verify_checksum=false ``` results: perf improved marginally for ingestion_db and did not change significantly for normal_db: SEEK_NEXT | DB | code | ops/sec | % change -- | -- | -- | -- | -- 0 | normal_db | master | 350880 |   0 | normal_db | PR6944 | 351040 | 0.0 0 | ingestion_db | master | 343255 |   0 | ingestion_db | PR6944 | 349424 | 1.8 10 | normal_db | master | 218711 |   10 | normal_db | PR6944 | 217892 | -0.4 10 | ingestion_db | master | 220334 |   10 | ingestion_db | PR6944 | 226437 | 2.8 Reviewed By: pdillinger Differential Revision: D21924676 Pulled By: ajkr fbshipit-source-id: ea4288a2eefa8112eb6c651a671c1de18c12e538
4 years ago
comparator->user_comparator(), rep->get_global_seqno(BlockType::kFilter),
&biter, kNullStats, true /* total_order_seek */,
false /* have_first_key */, index_key_includes_seq(),
index_value_is_full());
// Index partitions are assumed to be consecuitive. Prefetch them all.
// Read the first block offset
biter.SeekToFirst();
Add an option to put first key of each sst block in the index (#5289) Summary: The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes. Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it. So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks. Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files. This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289 Differential Revision: D15256423 Pulled By: al13n321 fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
BlockHandle handle = biter.value().handle;
uint64_t prefetch_off = handle.offset();
// Read the last block's offset
biter.SeekToLast();
Add an option to put first key of each sst block in the index (#5289) Summary: The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes. Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it. So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks. Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files. This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289 Differential Revision: D15256423 Pulled By: al13n321 fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
handle = biter.value().handle;
Improve / clean up meta block code & integrity (#9163) Summary: * Checksums are now checked on meta blocks unless specifically suppressed or not applicable (e.g. plain table). (Was other way around.) This means a number of cases that were not checking checksums now are, including direct read TableProperties in Version::GetTableProperties (fixed in meta_blocks ReadTableProperties), reading any block from PersistentCache (fixed in BlockFetcher), read TableProperties in SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open, maybe more. * For that to work, I moved the global_seqno+TableProperties checksum logic to the shared table/ code, because that is used by many utilies such as SstFileDumper. * Also for that to work, we have to know when we're dealing with a block that has a checksum (trailer), so added that capability to Footer based on magic number, and from there BlockFetcher. * Knowledge of trailer presence has also fixed a problem where other table formats were reading blocks including bytes for a non-existant trailer--and awkwardly kind-of not using them, e.g. no shared code checking checksums. (BlockFetcher compression type was populated incorrectly.) Now we only read what is needed. * Minimized code duplication and differing/incompatible/awkward abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block without parsing block handle) * Moved some meta block handling code from table_properties*.* * Moved some code specific to block-based table from shared table/ code to BlockBasedTable class. The checksum stuff means we can't completely separate it, but things that don't need to be in shared table/ code should not be. * Use unique_ptr rather than raw ptr in more places. (Note: you can std::move from unique_ptr to shared_ptr.) Without enhancements to GetPropertiesOfAllTablesTest (see below), net reduction of roughly 100 lines of code. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163 Test Plan: existing tests and * Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that checksums are now checked on direct read of table properties by TableCache (new test would fail before this change) * Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test putting table properties under old meta name * Also generally enhanced that same test to actually test what it was supposed to be testing already, by kicking things out of table cache when we don't want them there. Reviewed By: ajkr, mrambacher Differential Revision: D32514757 Pulled By: pdillinger fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
3 years ago
uint64_t last_off =
handle.offset() + handle.size() + BlockBasedTable::kBlockTrailerSize;
uint64_t prefetch_len = last_off - prefetch_off;
std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer,
false /* Implicit autoreadahead */);
IOOptions opts;
s = rep->file->PrepareIOOptions(ro, opts);
if (s.ok()) {
s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off,
Add rate limiter priority to ReadOptions (#9424) Summary: Users can set the priority for file reads associated with their operation by setting `ReadOptions::rate_limiter_priority` to something other than `Env::IO_TOTAL`. Rate limiting `VerifyChecksum()` and `VerifyFileChecksums()` is the motivation for this PR, so it also includes benchmarks and minor bug fixes to get that working. `RandomAccessFileReader::Read()` already had support for rate limiting compaction reads. I changed that rate limiting to be non-specific to compaction, but rather performed according to the passed in `Env::IOPriority`. Now the compaction read rate limiting is supported by setting `rate_limiter_priority = Env::IO_LOW` on its `ReadOptions`. There is no default value for the new `Env::IOPriority` parameter to `RandomAccessFileReader::Read()`. That means this PR goes through all callers (in some cases multiple layers up the call stack) to find a `ReadOptions` to provide the priority. There are TODOs for cases I believe it would be good to let user control the priority some day (e.g., file footer reads), and no TODO in cases I believe it doesn't matter (e.g., trace file reads). The API doc only lists the missing cases where a file read associated with a provided `ReadOptions` cannot be rate limited. For cases like file ingestion checksum calculation, there is no API to provide `ReadOptions` or `Env::IOPriority`, so I didn't count that as missing. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9424 Test Plan: - new unit tests - new benchmarks on ~50MB database with 1MB/s read rate limit and 100ms refill interval; verified with strace reads are chunked (at 0.1MB per chunk) and spaced roughly 100ms apart. - setup command: `./db_bench -benchmarks=fillrandom,compact -db=/tmp/testdb -target_file_size_base=1048576 -disable_auto_compactions=true -file_checksum=true` - benchmarks command: `strace -ttfe pread64 ./db_bench -benchmarks=verifychecksum,verifyfilechecksums -use_existing_db=true -db=/tmp/testdb -rate_limiter_bytes_per_sec=1048576 -rate_limit_bg_reads=1 -rate_limit_user_ops=true -file_checksum=true` - crash test using IO_USER priority on non-validation reads with https://github.com/facebook/rocksdb/issues/9567 reverted: `python3 tools/db_crashtest.py blackbox --max_key=1000000 --write_buffer_size=524288 --target_file_size_base=524288 --level_compaction_dynamic_level_bytes=true --duration=3600 --rate_limit_bg_reads=true --rate_limit_user_ops=true --rate_limiter_bytes_per_sec=10485760 --interval=10` Reviewed By: hx235 Differential Revision: D33747386 Pulled By: ajkr fbshipit-source-id: a2d985e97912fba8c54763798e04f006ccc56e0c
3 years ago
static_cast<size_t>(prefetch_len),
ro.rate_limiter_priority);
}
if (!s.ok()) {
return s;
}
// After prefetch, read the partitions one by one
for (biter.SeekToFirst(); biter.Valid(); biter.Next()) {
Add an option to put first key of each sst block in the index (#5289) Summary: The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes. Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it. So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks. Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files. This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289 Differential Revision: D15256423 Pulled By: al13n321 fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
handle = biter.value().handle;
Store the filter bits reader alongside the filter block contents (#5936) Summary: Amongst other things, PR https://github.com/facebook/rocksdb/issues/5504 refactored the filter block readers so that only the filter block contents are stored in the block cache (as opposed to the earlier design where the cache stored the filter block reader itself, leading to potentially dangling pointers and concurrency bugs). However, this change introduced a performance hit since with the new code, the metadata fields are re-parsed upon every access. This patch reunites the block contents with the filter bits reader to eliminate this overhead; since this is still a self-contained pure data object, it is safe to store it in the cache. (Note: this is similar to how the zstd digest is handled.) Pull Request resolved: https://github.com/facebook/rocksdb/pull/5936 Test Plan: make asan_check filter_bench results for the old code: ``` $ ./filter_bench -quick WARNING: Assertions are enabled; benchmarks unnecessarily slow Building... Build avg ns/key: 26.7153 Number of filters: 16669 Total memory (MB): 200.009 Bits/key actual: 10.0647 ---------------------------- Inside queries... Dry run (46b) ns/op: 33.4258 Single filter ns/op: 42.5974 Random filter ns/op: 217.861 ---------------------------- Outside queries... Dry run (25d) ns/op: 32.4217 Single filter ns/op: 50.9855 Random filter ns/op: 219.167 Average FP rate %: 1.13993 ---------------------------- Done. (For more info, run with -legend or -help.) $ ./filter_bench -quick -use_full_block_reader WARNING: Assertions are enabled; benchmarks unnecessarily slow Building... Build avg ns/key: 26.5172 Number of filters: 16669 Total memory (MB): 200.009 Bits/key actual: 10.0647 ---------------------------- Inside queries... Dry run (46b) ns/op: 32.3556 Single filter ns/op: 83.2239 Random filter ns/op: 370.676 ---------------------------- Outside queries... Dry run (25d) ns/op: 32.2265 Single filter ns/op: 93.5651 Random filter ns/op: 408.393 Average FP rate %: 1.13993 ---------------------------- Done. (For more info, run with -legend or -help.) ``` With the new code: ``` $ ./filter_bench -quick WARNING: Assertions are enabled; benchmarks unnecessarily slow Building... Build avg ns/key: 25.4285 Number of filters: 16669 Total memory (MB): 200.009 Bits/key actual: 10.0647 ---------------------------- Inside queries... Dry run (46b) ns/op: 31.0594 Single filter ns/op: 43.8974 Random filter ns/op: 226.075 ---------------------------- Outside queries... Dry run (25d) ns/op: 31.0295 Single filter ns/op: 50.3824 Random filter ns/op: 226.805 Average FP rate %: 1.13993 ---------------------------- Done. (For more info, run with -legend or -help.) $ ./filter_bench -quick -use_full_block_reader WARNING: Assertions are enabled; benchmarks unnecessarily slow Building... Build avg ns/key: 26.5308 Number of filters: 16669 Total memory (MB): 200.009 Bits/key actual: 10.0647 ---------------------------- Inside queries... Dry run (46b) ns/op: 33.2968 Single filter ns/op: 58.6163 Random filter ns/op: 291.434 ---------------------------- Outside queries... Dry run (25d) ns/op: 32.1839 Single filter ns/op: 66.9039 Random filter ns/op: 292.828 Average FP rate %: 1.13993 ---------------------------- Done. (For more info, run with -legend or -help.) ``` Differential Revision: D17991712 Pulled By: ltamasi fbshipit-source-id: 7ea205550217bfaaa1d5158ebd658e5832e60f29
5 years ago
CachableEntry<ParsedFullFilterBlock> block;
// TODO: Support counter batch update for partitioned index and
// filter blocks
s = table()->MaybeReadBlockAndLoadToCache(
prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
Parallelize secondary cache lookup in MultiGet (#8405) Summary: Implement the ```WaitAll()``` interface in ```LRUCache``` to allow callers to issue multiple lookups in parallel and wait for all of them to complete. Modify ```MultiGet``` to use this to parallelize the secondary cache lookups in order to reduce the overall latency. A call to ```cache->Lookup()``` returns a handle that has an incomplete value (nullptr), and the caller can call ```cache->IsReady()``` to check whether the lookup is complete, and pass a vector of handles to ```WaitAll``` to wait for completion. If any of the lookups fail, ```MultiGet``` will read the block from the SST file. Another change in this PR is to rename ```SecondaryCacheHandle``` to ```SecondaryCacheResultHandle``` as it more accurately describes the return result of the secondary cache lookup, which is more like a future. Tests: 1. Add unit tests in lru_cache_test 2. Benchmark results with no secondary cache configured Master - ``` readrandom : 41.175 micros/op 388562 ops/sec; 106.7 MB/s (7277999 of 7277999 found) readrandom : 41.217 micros/op 388160 ops/sec; 106.6 MB/s (7274999 of 7274999 found) multireadrandom : 10.309 micros/op 1552082 ops/sec; (28908992 of 28908992 found) multireadrandom : 10.321 micros/op 1550218 ops/sec; (29081984 of 29081984 found) ``` This PR - ``` readrandom : 41.158 micros/op 388723 ops/sec; 106.8 MB/s (7290999 of 7290999 found) readrandom : 41.185 micros/op 388463 ops/sec; 106.7 MB/s (7287999 of 7287999 found) multireadrandom : 10.277 micros/op 1556801 ops/sec; (29346944 of 29346944 found) multireadrandom : 10.253 micros/op 1560539 ops/sec; (29274944 of 29274944 found) ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/8405 Reviewed By: zhichao-cao Differential Revision: D29190509 Pulled By: anand1976 fbshipit-source-id: 6f8eff6246712af8a297cfe22ea0d1c3b2a01bb0
3 years ago
/* wait */ true, &block, BlockType::kFilter, nullptr /* get_context */,
&lookup_context, nullptr /* contents */);
if (!s.ok()) {
return s;
}
assert(s.ok() || block.GetValue() == nullptr);
if (block.GetValue() != nullptr) {
if (block.IsCached()) {
if (pin) {
filter_map_[handle.offset()] = std::move(block);
}
}
}
}
return biter.status();
}
const InternalKeyComparator* PartitionedFilterBlockReader::internal_comparator()
const {
assert(table());
assert(table()->get_rep());
return &table()->get_rep()->internal_comparator;
}
bool PartitionedFilterBlockReader::index_key_includes_seq() const {
assert(table());
assert(table()->get_rep());
return table()->get_rep()->index_key_includes_seq;
}
bool PartitionedFilterBlockReader::index_value_is_full() const {
assert(table());
assert(table()->get_rep());
return table()->get_rep()->index_value_is_full;
}
} // namespace ROCKSDB_NAMESPACE