|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
|
|
|
|
#include "table/block_based/partitioned_filter_block.h"
|
|
|
|
|
|
|
|
#include <utility>
|
|
|
|
|
|
|
|
#include "file/random_access_file_reader.h"
|
|
|
|
#include "logging/logging.h"
|
|
|
|
#include "monitoring/perf_context_imp.h"
|
|
|
|
#include "port/malloc.h"
|
|
|
|
#include "port/port.h"
|
|
|
|
#include "rocksdb/filter_policy.h"
|
|
|
|
#include "table/block_based/block.h"
|
|
|
|
#include "table/block_based/block_based_table_reader.h"
|
|
|
|
#include "util/coding.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
|
|
|
|
PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder(
|
|
|
|
const SliceTransform* _prefix_extractor, bool whole_key_filtering,
|
|
|
|
FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval,
|
|
|
|
const bool use_value_delta_encoding,
|
|
|
|
PartitionedIndexBuilder* const p_index_builder,
|
|
|
|
const uint32_t partition_size)
|
|
|
|
: FullFilterBlockBuilder(_prefix_extractor, whole_key_filtering,
|
|
|
|
filter_bits_builder),
|
|
|
|
index_on_filter_block_builder_(index_block_restart_interval,
|
|
|
|
true /*use_delta_encoding*/,
|
|
|
|
use_value_delta_encoding),
|
|
|
|
index_on_filter_block_builder_without_seq_(index_block_restart_interval,
|
|
|
|
true /*use_delta_encoding*/,
|
|
|
|
use_value_delta_encoding),
|
|
|
|
p_index_builder_(p_index_builder),
|
|
|
|
keys_added_to_partition_(0),
|
|
|
|
total_added_in_built_(0) {
|
|
|
|
keys_per_partition_ = static_cast<uint32_t>(
|
|
|
|
filter_bits_builder_->ApproximateNumEntries(partition_size));
|
|
|
|
if (keys_per_partition_ < 1) {
|
|
|
|
// partition_size (minus buffer, ~10%) might be smaller than minimum
|
|
|
|
// filter size, sometimes based on cache line size. Try to find that
|
|
|
|
// minimum size without CalculateSpace (not necessarily available).
|
|
|
|
uint32_t larger = std::max(partition_size + 4, uint32_t{16});
|
|
|
|
for (;;) {
|
|
|
|
keys_per_partition_ = static_cast<uint32_t>(
|
|
|
|
filter_bits_builder_->ApproximateNumEntries(larger));
|
|
|
|
if (keys_per_partition_ >= 1) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
larger += larger / 4;
|
|
|
|
if (larger > 100000) {
|
|
|
|
// might be a broken implementation. substitute something reasonable:
|
|
|
|
// 1 key / byte.
|
|
|
|
keys_per_partition_ = partition_size;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {}
|
|
|
|
|
|
|
|
void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock(
|
|
|
|
const Slice* next_key) {
|
|
|
|
// Use == to send the request only once
|
|
|
|
if (keys_added_to_partition_ == keys_per_partition_) {
|
|
|
|
// Currently only index builder is in charge of cutting a partition. We keep
|
|
|
|
// requesting until it is granted.
|
|
|
|
p_index_builder_->RequestPartitionCut();
|
|
|
|
}
|
|
|
|
if (!p_index_builder_->ShouldCutFilterBlock()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
Fix a bug for SeekForPrev with partitioned filter and prefix (#8137)
Summary:
According to https://github.com/facebook/rocksdb/issues/5907, each filter partition "should include the bloom of the prefix of the last
key in the previous partition" so that SeekForPrev() in prefix mode can return correct result.
The prefix of the last key in the previous partition does not necessarily have the same prefix
as the first key in the current partition. Regardless of the first key in current partition, the
prefix of the last key in the previous partition should be added. The existing code, however,
does not follow this. Furthermore, there is another issue: when finishing current filter partition,
`FullFilterBlockBuilder::AddPrefix()` is called for the first key in next filter partition, which effectively
overwrites `last_prefix_str_` prematurely. Consequently, when the filter block builder proceeds
to the next partition, `last_prefix_str_` will be the prefix of its first key, leaving no way of adding
the bloom of the prefix of the last key of the previous partition.
Prefix extractor is FixedLength.2.
```
[ filter part 1 ] [ filter part 2 ]
abc d
```
When SeekForPrev("abcd"), checking the filter partition will land on filter part 2 because "abcd" > "abc"
but smaller than "d".
If the filter in filter part 2 happens to return false for the test for "ab", then SeekForPrev("abcd") will build
incorrect iterator tree in non-total-order mode.
Also fix a unit test which starts to fail following this PR. `InDomain` should not fail due to assertion
error when checking on an arbitrary key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8137
Test Plan:
```
make check
```
Without this fix, the following command will fail pretty soon.
```
./db_stress --acquire_snapshot_one_in=10000 --avoid_flush_during_recovery=0 \
--avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=0 \
--batch_protection_bytes_per_key=0 --block_size=16384 --bloom_bits=17 \
--bottommost_compression_type=disable --cache_index_and_filter_blocks=1 --cache_size=1048576 \
--checkpoint_one_in=0 --checksum_type=kxxHash64 --clear_column_family_one_in=0 \
--compact_files_one_in=1000000 --compact_range_one_in=1000000 --compaction_ttl=0 \
--compression_max_dict_buffer_bytes=0 --compression_max_dict_bytes=0 \
--compression_parallel_threads=1 --compression_type=zstd --compression_zstd_max_train_bytes=0 \
--continuous_verification_interval=0 --db=/dev/shm/rocksdb/rocksdb_crashtest_whitebox \
--db_write_buffer_size=8388608 --delpercent=5 --delrangepercent=0 --destroy_db_initially=0 --enable_blob_files=0 \
--enable_compaction_filter=0 --enable_pipelined_write=1 --file_checksum_impl=big --flush_one_in=1000000 \
--format_version=5 --get_current_wal_file_one_in=0 --get_live_files_one_in=1000000 --get_property_one_in=1000000 \
--get_sorted_wal_files_one_in=0 --index_block_restart_interval=4 --index_type=2 --ingest_external_file_one_in=0 \
--iterpercent=10 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=True \
--log2_keys_per_lock=10 --long_running_snapshots=1 --mark_for_compaction_one_file_in=0 \
--max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=100000000 --max_key_len=3 \
--max_manifest_file_size=1073741824 --max_write_batch_group_size_bytes=16777216 --max_write_buffer_number=3 \
--max_write_buffer_size_to_maintain=8388608 --memtablerep=skip_list --mmap_read=1 --mock_direct_io=False \
--nooverwritepercent=0 --open_files=500000 --ops_per_thread=20000000 --optimize_filters_for_memory=0 --paranoid_file_checks=1 --partition_filters=1 --partition_pinning=0 --pause_background_one_in=1000000 \
--periodic_compaction_seconds=0 --prefixpercent=5 --progress_reports=0 --read_fault_one_in=0 --read_only=0 \
--readpercent=45 --recycle_log_file_num=0 --reopen=20 --secondary_catch_up_one_in=0 \
--snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=104857600 \
--sst_file_manager_bytes_per_truncate=0 --subcompactions=2 --sync=0 --sync_fault_injection=False \
--target_file_size_base=2097152 --target_file_size_multiplier=2 --test_batches_snapshots=0 --test_cf_consistency=0 \
--top_level_index_pinning=0 --unpartitioned_pinning=1 --use_blob_db=0 --use_block_based_filter=0 \
--use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_merge=0 \
--use_multiget=0 --use_ribbon_filter=0 --use_txn=0 --user_timestamp_size=8 --verify_checksum=1 \
--verify_checksum_one_in=1000000 --verify_db_one_in=100000 --write_buffer_size=4194304 \
--write_dbid_to_manifest=1 --writepercent=35
```
Reviewed By: pdillinger
Differential Revision: D27553054
Pulled By: riversand963
fbshipit-source-id: 60e391e4a2d8d98a9a3172ec5d6176b90ec3de98
4 years ago
|
|
|
// Add the prefix of the next key before finishing the partition without
|
|
|
|
// updating last_prefix_str_. This hack, fixes a bug with format_verison=3
|
|
|
|
// where seeking for the prefix would lead us to the previous partition.
|
|
|
|
const bool maybe_add_prefix =
|
|
|
|
next_key && prefix_extractor() && prefix_extractor()->InDomain(*next_key);
|
Fix a bug for SeekForPrev with partitioned filter and prefix (#8137)
Summary:
According to https://github.com/facebook/rocksdb/issues/5907, each filter partition "should include the bloom of the prefix of the last
key in the previous partition" so that SeekForPrev() in prefix mode can return correct result.
The prefix of the last key in the previous partition does not necessarily have the same prefix
as the first key in the current partition. Regardless of the first key in current partition, the
prefix of the last key in the previous partition should be added. The existing code, however,
does not follow this. Furthermore, there is another issue: when finishing current filter partition,
`FullFilterBlockBuilder::AddPrefix()` is called for the first key in next filter partition, which effectively
overwrites `last_prefix_str_` prematurely. Consequently, when the filter block builder proceeds
to the next partition, `last_prefix_str_` will be the prefix of its first key, leaving no way of adding
the bloom of the prefix of the last key of the previous partition.
Prefix extractor is FixedLength.2.
```
[ filter part 1 ] [ filter part 2 ]
abc d
```
When SeekForPrev("abcd"), checking the filter partition will land on filter part 2 because "abcd" > "abc"
but smaller than "d".
If the filter in filter part 2 happens to return false for the test for "ab", then SeekForPrev("abcd") will build
incorrect iterator tree in non-total-order mode.
Also fix a unit test which starts to fail following this PR. `InDomain` should not fail due to assertion
error when checking on an arbitrary key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8137
Test Plan:
```
make check
```
Without this fix, the following command will fail pretty soon.
```
./db_stress --acquire_snapshot_one_in=10000 --avoid_flush_during_recovery=0 \
--avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=0 \
--batch_protection_bytes_per_key=0 --block_size=16384 --bloom_bits=17 \
--bottommost_compression_type=disable --cache_index_and_filter_blocks=1 --cache_size=1048576 \
--checkpoint_one_in=0 --checksum_type=kxxHash64 --clear_column_family_one_in=0 \
--compact_files_one_in=1000000 --compact_range_one_in=1000000 --compaction_ttl=0 \
--compression_max_dict_buffer_bytes=0 --compression_max_dict_bytes=0 \
--compression_parallel_threads=1 --compression_type=zstd --compression_zstd_max_train_bytes=0 \
--continuous_verification_interval=0 --db=/dev/shm/rocksdb/rocksdb_crashtest_whitebox \
--db_write_buffer_size=8388608 --delpercent=5 --delrangepercent=0 --destroy_db_initially=0 --enable_blob_files=0 \
--enable_compaction_filter=0 --enable_pipelined_write=1 --file_checksum_impl=big --flush_one_in=1000000 \
--format_version=5 --get_current_wal_file_one_in=0 --get_live_files_one_in=1000000 --get_property_one_in=1000000 \
--get_sorted_wal_files_one_in=0 --index_block_restart_interval=4 --index_type=2 --ingest_external_file_one_in=0 \
--iterpercent=10 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=True \
--log2_keys_per_lock=10 --long_running_snapshots=1 --mark_for_compaction_one_file_in=0 \
--max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=100000000 --max_key_len=3 \
--max_manifest_file_size=1073741824 --max_write_batch_group_size_bytes=16777216 --max_write_buffer_number=3 \
--max_write_buffer_size_to_maintain=8388608 --memtablerep=skip_list --mmap_read=1 --mock_direct_io=False \
--nooverwritepercent=0 --open_files=500000 --ops_per_thread=20000000 --optimize_filters_for_memory=0 --paranoid_file_checks=1 --partition_filters=1 --partition_pinning=0 --pause_background_one_in=1000000 \
--periodic_compaction_seconds=0 --prefixpercent=5 --progress_reports=0 --read_fault_one_in=0 --read_only=0 \
--readpercent=45 --recycle_log_file_num=0 --reopen=20 --secondary_catch_up_one_in=0 \
--snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=104857600 \
--sst_file_manager_bytes_per_truncate=0 --subcompactions=2 --sync=0 --sync_fault_injection=False \
--target_file_size_base=2097152 --target_file_size_multiplier=2 --test_batches_snapshots=0 --test_cf_consistency=0 \
--top_level_index_pinning=0 --unpartitioned_pinning=1 --use_blob_db=0 --use_block_based_filter=0 \
--use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_merge=0 \
--use_multiget=0 --use_ribbon_filter=0 --use_txn=0 --user_timestamp_size=8 --verify_checksum=1 \
--verify_checksum_one_in=1000000 --verify_db_one_in=100000 --write_buffer_size=4194304 \
--write_dbid_to_manifest=1 --writepercent=35
```
Reviewed By: pdillinger
Differential Revision: D27553054
Pulled By: riversand963
fbshipit-source-id: 60e391e4a2d8d98a9a3172ec5d6176b90ec3de98
4 years ago
|
|
|
if (maybe_add_prefix) {
|
|
|
|
const Slice next_key_prefix = prefix_extractor()->Transform(*next_key);
|
|
|
|
if (next_key_prefix.compare(last_prefix_str()) != 0) {
|
|
|
|
AddKey(next_key_prefix);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
total_added_in_built_ += filter_bits_builder_->EstimateEntriesAdded();
|
|
|
|
std::unique_ptr<const char[]> filter_data;
|
|
|
|
Slice filter = filter_bits_builder_->Finish(&filter_data);
|
|
|
|
std::string& index_key = p_index_builder_->GetPartitionKey();
|
|
|
|
filters.push_back({index_key, filter, std::move(filter_data)});
|
|
|
|
keys_added_to_partition_ = 0;
|
|
|
|
Reset();
|
|
|
|
}
|
|
|
|
|
|
|
|
void PartitionedFilterBlockBuilder::Add(const Slice& key) {
|
|
|
|
MaybeCutAFilterBlock(&key);
|
|
|
|
FullFilterBlockBuilder::Add(key);
|
|
|
|
}
|
|
|
|
|
|
|
|
void PartitionedFilterBlockBuilder::AddKey(const Slice& key) {
|
|
|
|
FullFilterBlockBuilder::AddKey(key);
|
|
|
|
keys_added_to_partition_++;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t PartitionedFilterBlockBuilder::EstimateEntriesAdded() {
|
|
|
|
return total_added_in_built_ + filter_bits_builder_->EstimateEntriesAdded();
|
|
|
|
}
|
|
|
|
|
|
|
|
Slice PartitionedFilterBlockBuilder::Finish(
|
|
|
|
const BlockHandle& last_partition_block_handle, Status* status,
|
|
|
|
std::unique_ptr<const char[]>* filter_data) {
|
|
|
|
if (finishing_filters == true) {
|
|
|
|
// Record the handle of the last written filter block in the index
|
|
|
|
std::string handle_encoding;
|
|
|
|
last_partition_block_handle.EncodeTo(&handle_encoding);
|
|
|
|
std::string handle_delta_encoding;
|
|
|
|
PutVarsignedint64(
|
|
|
|
&handle_delta_encoding,
|
|
|
|
last_partition_block_handle.size() - last_encoded_handle_.size());
|
|
|
|
last_encoded_handle_ = last_partition_block_handle;
|
|
|
|
const Slice handle_delta_encoding_slice(handle_delta_encoding);
|
|
|
|
index_on_filter_block_builder_.Add(last_filter_entry_key, handle_encoding,
|
|
|
|
&handle_delta_encoding_slice);
|
|
|
|
if (!p_index_builder_->seperator_is_key_plus_seq()) {
|
|
|
|
index_on_filter_block_builder_without_seq_.Add(
|
|
|
|
ExtractUserKey(last_filter_entry_key), handle_encoding,
|
|
|
|
&handle_delta_encoding_slice);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
MaybeCutAFilterBlock(nullptr);
|
|
|
|
}
|
|
|
|
// If there is no filter partition left, then return the index on filter
|
|
|
|
// partitions
|
|
|
|
if (UNLIKELY(filters.empty())) {
|
|
|
|
*status = Status::OK();
|
|
|
|
last_filter_data.reset();
|
|
|
|
if (finishing_filters) {
|
|
|
|
// Simplest to just add them all at the end
|
|
|
|
total_added_in_built_ = 0;
|
|
|
|
if (p_index_builder_->seperator_is_key_plus_seq()) {
|
|
|
|
return index_on_filter_block_builder_.Finish();
|
|
|
|
} else {
|
|
|
|
return index_on_filter_block_builder_without_seq_.Finish();
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// This is the rare case where no key was added to the filter
|
|
|
|
return Slice();
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// Return the next filter partition in line and set Incomplete() status to
|
|
|
|
// indicate we expect more calls to Finish
|
|
|
|
*status = Status::Incomplete();
|
|
|
|
finishing_filters = true;
|
|
|
|
|
|
|
|
last_filter_entry_key = filters.front().key;
|
|
|
|
Slice filter = filters.front().filter;
|
|
|
|
last_filter_data = std::move(filters.front().filter_data);
|
|
|
|
if (filter_data != nullptr) {
|
|
|
|
*filter_data = std::move(last_filter_data);
|
|
|
|
}
|
|
|
|
filters.pop_front();
|
|
|
|
return filter;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
PartitionedFilterBlockReader::PartitionedFilterBlockReader(
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
const BlockBasedTable* t, CachableEntry<Block>&& filter_block)
|
|
|
|
: FilterBlockReaderCommon(t, std::move(filter_block)) {}
|
|
|
|
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
std::unique_ptr<FilterBlockReader> PartitionedFilterBlockReader::Create(
|
|
|
|
const BlockBasedTable* table, const ReadOptions& ro,
|
|
|
|
FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
|
|
|
|
bool pin, BlockCacheLookupContext* lookup_context) {
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
assert(table);
|
|
|
|
assert(table->get_rep());
|
|
|
|
assert(!pin || prefetch);
|
|
|
|
|
|
|
|
CachableEntry<Block> filter_block;
|
|
|
|
if (prefetch || !use_cache) {
|
|
|
|
const Status s = ReadFilterBlock(table, prefetch_buffer, ro, use_cache,
|
|
|
|
nullptr /* get_context */, lookup_context,
|
|
|
|
&filter_block);
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
if (!s.ok()) {
|
|
|
|
IGNORE_STATUS_IF_ERROR(s);
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
return std::unique_ptr<FilterBlockReader>();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (use_cache && !pin) {
|
|
|
|
filter_block.Reset();
|
|
|
|
}
|
|
|
|
}
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
|
|
|
|
return std::unique_ptr<FilterBlockReader>(
|
|
|
|
new PartitionedFilterBlockReader(table, std::move(filter_block)));
|
|
|
|
}
|
|
|
|
|
|
|
|
bool PartitionedFilterBlockReader::KeyMayMatch(
|
|
|
|
const Slice& key, const SliceTransform* prefix_extractor,
|
|
|
|
uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr,
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
GetContext* get_context, BlockCacheLookupContext* lookup_context) {
|
|
|
|
assert(const_ikey_ptr != nullptr);
|
|
|
|
assert(block_offset == kNotValid);
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
if (!whole_key_filtering()) {
|
|
|
|
return true;
|
|
|
|
}
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
|
|
|
|
return MayMatch(key, prefix_extractor, block_offset, no_io, const_ikey_ptr,
|
|
|
|
get_context, lookup_context,
|
|
|
|
&FullFilterBlockReader::KeyMayMatch);
|
|
|
|
}
|
|
|
|
|
|
|
|
void PartitionedFilterBlockReader::KeysMayMatch(
|
|
|
|
MultiGetRange* range, const SliceTransform* prefix_extractor,
|
|
|
|
uint64_t block_offset, const bool no_io,
|
|
|
|
BlockCacheLookupContext* lookup_context) {
|
|
|
|
assert(block_offset == kNotValid);
|
|
|
|
if (!whole_key_filtering()) {
|
|
|
|
return; // Any/all may match
|
|
|
|
}
|
|
|
|
|
|
|
|
MayMatch(range, prefix_extractor, block_offset, no_io, lookup_context,
|
|
|
|
&FullFilterBlockReader::KeysMayMatch);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool PartitionedFilterBlockReader::PrefixMayMatch(
|
|
|
|
const Slice& prefix, const SliceTransform* prefix_extractor,
|
|
|
|
uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr,
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
GetContext* get_context, BlockCacheLookupContext* lookup_context) {
|
|
|
|
assert(const_ikey_ptr != nullptr);
|
|
|
|
assert(block_offset == kNotValid);
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
if (!table_prefix_extractor() && !prefix_extractor) {
|
|
|
|
return true;
|
|
|
|
}
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
|
|
|
|
return MayMatch(prefix, prefix_extractor, block_offset, no_io, const_ikey_ptr,
|
|
|
|
get_context, lookup_context,
|
|
|
|
&FullFilterBlockReader::PrefixMayMatch);
|
|
|
|
}
|
|
|
|
|
|
|
|
void PartitionedFilterBlockReader::PrefixesMayMatch(
|
|
|
|
MultiGetRange* range, const SliceTransform* prefix_extractor,
|
|
|
|
uint64_t block_offset, const bool no_io,
|
|
|
|
BlockCacheLookupContext* lookup_context) {
|
|
|
|
assert(block_offset == kNotValid);
|
|
|
|
if (!table_prefix_extractor() && !prefix_extractor) {
|
|
|
|
return; // Any/all may match
|
|
|
|
}
|
|
|
|
|
|
|
|
MayMatch(range, prefix_extractor, block_offset, no_io, lookup_context,
|
|
|
|
&FullFilterBlockReader::PrefixesMayMatch);
|
|
|
|
}
|
|
|
|
|
|
|
|
BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle(
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
const CachableEntry<Block>& filter_block, const Slice& entry) const {
|
|
|
|
IndexBlockIter iter;
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
const InternalKeyComparator* const comparator = internal_comparator();
|
|
|
|
Statistics* kNullStats = nullptr;
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
filter_block.GetValue()->NewIndexIterator(
|
|
|
|
comparator->user_comparator(),
|
|
|
|
table()->get_rep()->get_global_seqno(BlockType::kFilter), &iter,
|
|
|
|
kNullStats, true /* total_order_seek */, false /* have_first_key */,
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
index_key_includes_seq(), index_value_is_full());
|
|
|
|
iter.Seek(entry);
|
|
|
|
if (UNLIKELY(!iter.Valid())) {
|
|
|
|
// entry is larger than all the keys. However its prefix might still be
|
|
|
|
// present in the last partition. If this is called by PrefixMayMatch this
|
|
|
|
// is necessary for correct behavior. Otherwise it is unnecessary but safe.
|
|
|
|
// Assuming this is an unlikely case for full key search, the performance
|
|
|
|
// overhead should be negligible.
|
|
|
|
iter.SeekToLast();
|
|
|
|
}
|
|
|
|
assert(iter.Valid());
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
BlockHandle fltr_blk_handle = iter.value().handle;
|
|
|
|
return fltr_blk_handle;
|
|
|
|
}
|
|
|
|
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
Status PartitionedFilterBlockReader::GetFilterPartitionBlock(
|
|
|
|
FilePrefetchBuffer* prefetch_buffer, const BlockHandle& fltr_blk_handle,
|
|
|
|
bool no_io, GetContext* get_context,
|
|
|
|
BlockCacheLookupContext* lookup_context,
|
|
|
|
CachableEntry<ParsedFullFilterBlock>* filter_block) const {
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
assert(table());
|
|
|
|
assert(filter_block);
|
|
|
|
assert(filter_block->IsEmpty());
|
|
|
|
|
|
|
|
if (!filter_map_.empty()) {
|
|
|
|
auto iter = filter_map_.find(fltr_blk_handle.offset());
|
|
|
|
// This is a possible scenario since block cache might not have had space
|
|
|
|
// for the partition
|
|
|
|
if (iter != filter_map_.end()) {
|
|
|
|
filter_block->SetUnownedValue(iter->second.GetValue());
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
}
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
|
|
|
|
ReadOptions read_options;
|
|
|
|
if (no_io) {
|
|
|
|
read_options.read_tier = kBlockCacheTier;
|
|
|
|
}
|
|
|
|
|
|
|
|
const Status s =
|
|
|
|
table()->RetrieveBlock(prefetch_buffer, read_options, fltr_blk_handle,
|
|
|
|
UncompressionDict::GetEmptyDict(), filter_block,
|
|
|
|
BlockType::kFilter, get_context, lookup_context,
|
|
|
|
/* for_compaction */ false, /* use_cache */ true,
|
|
|
|
/* wait_for_cache */ true);
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool PartitionedFilterBlockReader::MayMatch(
|
|
|
|
const Slice& slice, const SliceTransform* prefix_extractor,
|
|
|
|
uint64_t block_offset, bool no_io, const Slice* const_ikey_ptr,
|
|
|
|
GetContext* get_context, BlockCacheLookupContext* lookup_context,
|
|
|
|
FilterFunction filter_function) const {
|
|
|
|
CachableEntry<Block> filter_block;
|
|
|
|
Status s =
|
|
|
|
GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block);
|
|
|
|
if (UNLIKELY(!s.ok())) {
|
|
|
|
IGNORE_STATUS_IF_ERROR(s);
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (UNLIKELY(filter_block.GetValue()->size() == 0)) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
auto filter_handle = GetFilterPartitionHandle(filter_block, *const_ikey_ptr);
|
|
|
|
if (UNLIKELY(filter_handle.size() == 0)) { // key is out of range
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
CachableEntry<ParsedFullFilterBlock> filter_partition_block;
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
s = GetFilterPartitionBlock(nullptr /* prefetch_buffer */, filter_handle,
|
|
|
|
no_io, get_context, lookup_context,
|
|
|
|
&filter_partition_block);
|
|
|
|
if (UNLIKELY(!s.ok())) {
|
|
|
|
IGNORE_STATUS_IF_ERROR(s);
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
FullFilterBlockReader filter_partition(table(),
|
|
|
|
std::move(filter_partition_block));
|
|
|
|
return (filter_partition.*filter_function)(
|
|
|
|
slice, prefix_extractor, block_offset, no_io, const_ikey_ptr, get_context,
|
|
|
|
lookup_context);
|
|
|
|
}
|
|
|
|
|
|
|
|
void PartitionedFilterBlockReader::MayMatch(
|
|
|
|
MultiGetRange* range, const SliceTransform* prefix_extractor,
|
|
|
|
uint64_t block_offset, bool no_io, BlockCacheLookupContext* lookup_context,
|
|
|
|
FilterManyFunction filter_function) const {
|
|
|
|
CachableEntry<Block> filter_block;
|
|
|
|
Status s = GetOrReadFilterBlock(no_io, range->begin()->get_context,
|
|
|
|
lookup_context, &filter_block);
|
|
|
|
if (UNLIKELY(!s.ok())) {
|
|
|
|
IGNORE_STATUS_IF_ERROR(s);
|
|
|
|
return; // Any/all may match
|
|
|
|
}
|
|
|
|
|
|
|
|
if (UNLIKELY(filter_block.GetValue()->size() == 0)) {
|
|
|
|
return; // Any/all may match
|
|
|
|
}
|
|
|
|
|
|
|
|
auto start_iter_same_handle = range->begin();
|
|
|
|
BlockHandle prev_filter_handle = BlockHandle::NullBlockHandle();
|
|
|
|
|
|
|
|
// For all keys mapping to same partition (must be adjacent in sorted order)
|
|
|
|
// share block cache lookup and use full filter multiget on the partition
|
|
|
|
// filter.
|
|
|
|
for (auto iter = start_iter_same_handle; iter != range->end(); ++iter) {
|
|
|
|
// TODO: re-use one top-level index iterator
|
|
|
|
BlockHandle this_filter_handle =
|
|
|
|
GetFilterPartitionHandle(filter_block, iter->ikey);
|
|
|
|
if (!prev_filter_handle.IsNull() &&
|
|
|
|
this_filter_handle != prev_filter_handle) {
|
|
|
|
MultiGetRange subrange(*range, start_iter_same_handle, iter);
|
|
|
|
MayMatchPartition(&subrange, prefix_extractor, block_offset,
|
|
|
|
prev_filter_handle, no_io, lookup_context,
|
|
|
|
filter_function);
|
|
|
|
range->AddSkipsFrom(subrange);
|
|
|
|
start_iter_same_handle = iter;
|
|
|
|
}
|
|
|
|
if (UNLIKELY(this_filter_handle.size() == 0)) { // key is out of range
|
|
|
|
// Not reachable with current behavior of GetFilterPartitionHandle
|
|
|
|
assert(false);
|
|
|
|
range->SkipKey(iter);
|
|
|
|
prev_filter_handle = BlockHandle::NullBlockHandle();
|
|
|
|
} else {
|
|
|
|
prev_filter_handle = this_filter_handle;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!prev_filter_handle.IsNull()) {
|
|
|
|
MultiGetRange subrange(*range, start_iter_same_handle, range->end());
|
|
|
|
MayMatchPartition(&subrange, prefix_extractor, block_offset,
|
|
|
|
prev_filter_handle, no_io, lookup_context,
|
|
|
|
filter_function);
|
|
|
|
range->AddSkipsFrom(subrange);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void PartitionedFilterBlockReader::MayMatchPartition(
|
|
|
|
MultiGetRange* range, const SliceTransform* prefix_extractor,
|
|
|
|
uint64_t block_offset, BlockHandle filter_handle, bool no_io,
|
|
|
|
BlockCacheLookupContext* lookup_context,
|
|
|
|
FilterManyFunction filter_function) const {
|
|
|
|
CachableEntry<ParsedFullFilterBlock> filter_partition_block;
|
|
|
|
Status s = GetFilterPartitionBlock(
|
|
|
|
nullptr /* prefetch_buffer */, filter_handle, no_io,
|
|
|
|
range->begin()->get_context, lookup_context, &filter_partition_block);
|
|
|
|
if (UNLIKELY(!s.ok())) {
|
|
|
|
IGNORE_STATUS_IF_ERROR(s);
|
|
|
|
return; // Any/all may match
|
|
|
|
}
|
|
|
|
|
|
|
|
FullFilterBlockReader filter_partition(table(),
|
|
|
|
std::move(filter_partition_block));
|
|
|
|
(filter_partition.*filter_function)(range, prefix_extractor, block_offset,
|
|
|
|
no_io, lookup_context);
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const {
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
size_t usage = ApproximateFilterBlockMemoryUsage();
|
|
|
|
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
usage += malloc_usable_size(const_cast<PartitionedFilterBlockReader*>(this));
|
|
|
|
#else
|
|
|
|
usage += sizeof(*this);
|
|
|
|
#endif // ROCKSDB_MALLOC_USABLE_SIZE
|
|
|
|
return usage;
|
|
|
|
// TODO(myabandeh): better estimation for filter_map_ size
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO(myabandeh): merge this with the same function in IndexReader
|
|
|
|
Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro,
|
|
|
|
bool pin) {
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
assert(table());
|
|
|
|
|
|
|
|
const BlockBasedTable::Rep* const rep = table()->get_rep();
|
|
|
|
assert(rep);
|
|
|
|
|
|
|
|
BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
|
|
|
|
CachableEntry<Block> filter_block;
|
|
|
|
|
|
|
|
Status s = GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */,
|
|
|
|
&lookup_context, &filter_block);
|
|
|
|
if (!s.ok()) {
|
|
|
|
ROCKS_LOG_ERROR(rep->ioptions.logger,
|
|
|
|
"Error retrieving top-level filter block while trying to "
|
|
|
|
"cache filter partitions: %s",
|
|
|
|
s.ToString().c_str());
|
|
|
|
return s;
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
}
|
|
|
|
|
|
|
|
// Before read partitions, prefetch them to avoid lots of IOs
|
|
|
|
assert(filter_block.GetValue());
|
|
|
|
|
|
|
|
IndexBlockIter biter;
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
const InternalKeyComparator* const comparator = internal_comparator();
|
|
|
|
Statistics* kNullStats = nullptr;
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
filter_block.GetValue()->NewIndexIterator(
|
|
|
|
comparator->user_comparator(), rep->get_global_seqno(BlockType::kFilter),
|
|
|
|
&biter, kNullStats, true /* total_order_seek */,
|
|
|
|
false /* have_first_key */, index_key_includes_seq(),
|
|
|
|
index_value_is_full());
|
|
|
|
// Index partitions are assumed to be consecuitive. Prefetch them all.
|
|
|
|
// Read the first block offset
|
|
|
|
biter.SeekToFirst();
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
BlockHandle handle = biter.value().handle;
|
|
|
|
uint64_t prefetch_off = handle.offset();
|
|
|
|
|
|
|
|
// Read the last block's offset
|
|
|
|
biter.SeekToLast();
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
handle = biter.value().handle;
|
|
|
|
uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize;
|
|
|
|
uint64_t prefetch_len = last_off - prefetch_off;
|
|
|
|
std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
|
|
|
|
rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer,
|
|
|
|
false /* Implicit autoreadahead */);
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
|
|
|
|
IOOptions opts;
|
|
|
|
s = rep->file->PrepareIOOptions(ro, opts);
|
|
|
|
if (s.ok()) {
|
|
|
|
s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off,
|
|
|
|
static_cast<size_t>(prefetch_len));
|
|
|
|
}
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
// After prefetch, read the partitions one by one
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
for (biter.SeekToFirst(); biter.Valid(); biter.Next()) {
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
handle = biter.value().handle;
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
|
|
|
|
CachableEntry<ParsedFullFilterBlock> block;
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
// TODO: Support counter batch update for partitioned index and
|
|
|
|
// filter blocks
|
|
|
|
s = table()->MaybeReadBlockAndLoadToCache(
|
|
|
|
prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
|
|
|
|
/* wait */ true, &block, BlockType::kFilter, nullptr /* get_context */,
|
|
|
|
&lookup_context, nullptr /* contents */);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
assert(s.ok() || block.GetValue() == nullptr);
|
|
|
|
|
|
|
|
if (block.GetValue() != nullptr) {
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
if (block.IsCached()) {
|
|
|
|
if (pin) {
|
|
|
|
filter_map_[handle.offset()] = std::move(block);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return biter.status();
|
|
|
|
}
|
|
|
|
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
const InternalKeyComparator* PartitionedFilterBlockReader::internal_comparator()
|
|
|
|
const {
|
|
|
|
assert(table());
|
|
|
|
assert(table()->get_rep());
|
|
|
|
|
|
|
|
return &table()->get_rep()->internal_comparator;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool PartitionedFilterBlockReader::index_key_includes_seq() const {
|
|
|
|
assert(table());
|
|
|
|
assert(table()->get_rep());
|
|
|
|
|
|
|
|
return table()->get_rep()->index_key_includes_seq;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool PartitionedFilterBlockReader::index_value_is_full() const {
|
|
|
|
assert(table());
|
|
|
|
assert(table()->get_rep());
|
|
|
|
|
|
|
|
return table()->get_rep()->index_value_is_full;
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|