|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "table/block_based/block_based_table_factory.h"
|
|
|
|
|
|
|
|
#include <stdint.h>
|
|
|
|
|
|
|
|
#include <cinttypes>
|
|
|
|
#include <memory>
|
|
|
|
#include <string>
|
|
|
|
|
|
|
|
#include "cache/cache_entry_roles.h"
|
|
|
|
#include "cache/cache_reservation_manager.h"
|
|
|
|
#include "logging/logging.h"
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
3 years ago
|
|
|
#include "options/options_helper.h"
|
|
|
|
#include "port/port.h"
|
|
|
|
#include "rocksdb/cache.h"
|
|
|
|
#include "rocksdb/convenience.h"
|
|
|
|
#include "rocksdb/filter_policy.h"
|
|
|
|
#include "rocksdb/flush_block_policy.h"
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
3 years ago
|
|
|
#include "rocksdb/rocksdb_namespace.h"
|
|
|
|
#include "rocksdb/table.h"
|
|
|
|
#include "rocksdb/utilities/options_type.h"
|
|
|
|
#include "table/block_based/block_based_table_builder.h"
|
|
|
|
#include "table/block_based/block_based_table_reader.h"
|
|
|
|
#include "table/format.h"
|
|
|
|
#include "util/mutexlock.h"
|
|
|
|
#include "util/string_util.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
|
|
|
|
void TailPrefetchStats::RecordEffectiveSize(size_t len) {
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
if (num_records_ < kNumTracked) {
|
|
|
|
num_records_++;
|
|
|
|
}
|
|
|
|
records_[next_++] = len;
|
|
|
|
if (next_ == kNumTracked) {
|
|
|
|
next_ = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t TailPrefetchStats::GetSuggestedPrefetchSize() {
|
|
|
|
std::vector<size_t> sorted;
|
|
|
|
{
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
|
|
|
|
if (num_records_ == 0) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
sorted.assign(records_, records_ + num_records_);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Of the historic size, we find the maximum one that satisifis the condtiion
|
|
|
|
// that if prefetching all, less than 1/8 will be wasted.
|
|
|
|
std::sort(sorted.begin(), sorted.end());
|
|
|
|
|
|
|
|
// Assuming we have 5 data points, and after sorting it looks like this:
|
|
|
|
//
|
|
|
|
// +---+
|
|
|
|
// +---+ | |
|
|
|
|
// | | | |
|
|
|
|
// | | | |
|
|
|
|
// | | | |
|
|
|
|
// | | | |
|
|
|
|
// +---+ | | | |
|
|
|
|
// | | | | | |
|
|
|
|
// +---+ | | | | | |
|
|
|
|
// | | | | | | | |
|
|
|
|
// +---+ | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// +---+ +---+ +---+ +---+ +---+
|
|
|
|
//
|
|
|
|
// and we use every of the value as a candidate, and estimate how much we
|
|
|
|
// wasted, compared to read. For example, when we use the 3rd record
|
|
|
|
// as candiate. This area is what we read:
|
|
|
|
// +---+
|
|
|
|
// +---+ | |
|
|
|
|
// | | | |
|
|
|
|
// | | | |
|
|
|
|
// | | | |
|
|
|
|
// | | | |
|
|
|
|
// *** *** *** ***+ *** *** *** *** **
|
|
|
|
// * | | | | | |
|
|
|
|
// +---+ | | | | | *
|
|
|
|
// * | | | | | | | |
|
|
|
|
// +---+ | | | | | | | *
|
|
|
|
// * | | | | X | | | | |
|
|
|
|
// | | | | | | | | | *
|
|
|
|
// * | | | | | | | | |
|
|
|
|
// | | | | | | | | | *
|
|
|
|
// * | | | | | | | | |
|
|
|
|
// *** *** ***-*** ***--*** ***--*** +****
|
|
|
|
// which is (size of the record) X (number of records).
|
|
|
|
//
|
|
|
|
// While wasted is this area:
|
|
|
|
// +---+
|
|
|
|
// +---+ | |
|
|
|
|
// | | | |
|
|
|
|
// | | | |
|
|
|
|
// | | | |
|
|
|
|
// | | | |
|
|
|
|
// *** *** *** ****---+ | | | |
|
|
|
|
// * * | | | | |
|
|
|
|
// * *-*** *** | | | | |
|
|
|
|
// * * | | | | | | |
|
|
|
|
// *--** *** | | | | | | |
|
|
|
|
// | | | | | X | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// +---+ +---+ +---+ +---+ +---+
|
|
|
|
//
|
|
|
|
// Which can be calculated iteratively.
|
|
|
|
// The difference between wasted using 4st and 3rd record, will
|
|
|
|
// be following area:
|
|
|
|
// +---+
|
|
|
|
// +--+ +-+ ++ +-+ +-+ +---+ | |
|
|
|
|
// + xxxxxxxxxxxxxxxxxxxxxxxx | | | |
|
|
|
|
// xxxxxxxxxxxxxxxxxxxxxxxx | | | |
|
|
|
|
// + xxxxxxxxxxxxxxxxxxxxxxxx | | | |
|
|
|
|
// | xxxxxxxxxxxxxxxxxxxxxxxx | | | |
|
|
|
|
// +-+ +-+ +-+ ++ +---+ +--+ | | |
|
|
|
|
// | | | | | | |
|
|
|
|
// +---+ ++ | | | | | |
|
|
|
|
// | | | | | | X | | |
|
|
|
|
// +---+ ++ | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// +---+ +---+ +---+ +---+ +---+
|
|
|
|
//
|
|
|
|
// which will be the size difference between 4st and 3rd record,
|
|
|
|
// times 3, which is number of records before the 4st.
|
|
|
|
// Here we assume that all data within the prefetch range will be useful. In
|
|
|
|
// reality, it may not be the case when a partial block is inside the range,
|
|
|
|
// or there are data in the middle that is not read. We ignore those cases
|
|
|
|
// for simplicity.
|
|
|
|
assert(!sorted.empty());
|
|
|
|
size_t prev_size = sorted[0];
|
|
|
|
size_t max_qualified_size = sorted[0];
|
|
|
|
size_t wasted = 0;
|
|
|
|
for (size_t i = 1; i < sorted.size(); i++) {
|
|
|
|
size_t read = sorted[i] * sorted.size();
|
|
|
|
wasted += (sorted[i] - prev_size) * i;
|
|
|
|
if (wasted <= read / 8) {
|
|
|
|
max_qualified_size = sorted[i];
|
|
|
|
}
|
|
|
|
prev_size = sorted[i];
|
|
|
|
}
|
|
|
|
const size_t kMaxPrefetchSize = 512 * 1024; // Never exceed 512KB
|
|
|
|
return std::min(kMaxPrefetchSize, max_qualified_size);
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
|
|
|
|
// Option name under which BlockBasedTableOptions::metadata_cache_options is
// registered in the block-based table option map.
const std::string kOptNameMetadataCacheOpts("metadata_cache_options");
|
|
|
|
|
|
|
|
static std::unordered_map<std::string, PinningTier>
|
|
|
|
pinning_tier_type_string_map = {
|
|
|
|
{"kFallback", PinningTier::kFallback},
|
|
|
|
{"kNone", PinningTier::kNone},
|
|
|
|
{"kFlushedAndSimilar", PinningTier::kFlushedAndSimilar},
|
|
|
|
{"kAll", PinningTier::kAll}};
|
|
|
|
|
|
|
|
static std::unordered_map<std::string, BlockBasedTableOptions::IndexType>
|
|
|
|
block_base_table_index_type_string_map = {
|
|
|
|
{"kBinarySearch", BlockBasedTableOptions::IndexType::kBinarySearch},
|
|
|
|
{"kHashSearch", BlockBasedTableOptions::IndexType::kHashSearch},
|
|
|
|
{"kTwoLevelIndexSearch",
|
|
|
|
BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch},
|
|
|
|
{"kBinarySearchWithFirstKey",
|
|
|
|
BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey}};
|
|
|
|
|
|
|
|
static std::unordered_map<std::string,
|
|
|
|
BlockBasedTableOptions::DataBlockIndexType>
|
|
|
|
block_base_table_data_block_index_type_string_map = {
|
|
|
|
{"kDataBlockBinarySearch",
|
|
|
|
BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch},
|
|
|
|
{"kDataBlockBinaryAndHash",
|
|
|
|
BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash}};
|
|
|
|
|
|
|
|
static std::unordered_map<std::string,
|
|
|
|
BlockBasedTableOptions::IndexShorteningMode>
|
|
|
|
block_base_table_index_shortening_mode_string_map = {
|
|
|
|
{"kNoShortening",
|
|
|
|
BlockBasedTableOptions::IndexShorteningMode::kNoShortening},
|
|
|
|
{"kShortenSeparators",
|
|
|
|
BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators},
|
|
|
|
{"kShortenSeparatorsAndSuccessor",
|
|
|
|
BlockBasedTableOptions::IndexShorteningMode::
|
|
|
|
kShortenSeparatorsAndSuccessor}};
|
|
|
|
|
|
|
|
static std::unordered_map<std::string, OptionTypeInfo>
|
|
|
|
metadata_cache_options_type_info = {
|
|
|
|
{"top_level_index_pinning",
|
|
|
|
OptionTypeInfo::Enum<PinningTier>(
|
|
|
|
offsetof(struct MetadataCacheOptions, top_level_index_pinning),
|
|
|
|
&pinning_tier_type_string_map)},
|
|
|
|
{"partition_pinning",
|
|
|
|
OptionTypeInfo::Enum<PinningTier>(
|
|
|
|
offsetof(struct MetadataCacheOptions, partition_pinning),
|
|
|
|
&pinning_tier_type_string_map)},
|
|
|
|
{"unpartitioned_pinning",
|
|
|
|
OptionTypeInfo::Enum<PinningTier>(
|
|
|
|
offsetof(struct MetadataCacheOptions, unpartitioned_pinning),
|
|
|
|
&pinning_tier_type_string_map)}};
|
|
|
|
|
|
|
|
static std::unordered_map<std::string,
|
|
|
|
BlockBasedTableOptions::PrepopulateBlockCache>
|
|
|
|
block_base_table_prepopulate_block_cache_string_map = {
|
|
|
|
{"kDisable", BlockBasedTableOptions::PrepopulateBlockCache::kDisable},
|
|
|
|
{"kFlushOnly",
|
|
|
|
BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly}};
|
|
|
|
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
|
|
|
|
static std::unordered_map<std::string, OptionTypeInfo>
|
|
|
|
block_based_table_type_info = {
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
/* currently not supported
|
|
|
|
std::shared_ptr<Cache> block_cache = nullptr;
|
|
|
|
std::shared_ptr<Cache> block_cache_compressed = nullptr;
|
|
|
|
CacheUsageOptions cache_usage_options;
|
|
|
|
*/
|
|
|
|
{"flush_block_policy_factory",
|
|
|
|
OptionTypeInfo::AsCustomSharedPtr<FlushBlockPolicyFactory>(
|
|
|
|
offsetof(struct BlockBasedTableOptions,
|
|
|
|
flush_block_policy_factory),
|
|
|
|
OptionVerificationType::kByName, OptionTypeFlags::kCompareNever)},
|
|
|
|
{"cache_index_and_filter_blocks",
|
|
|
|
{offsetof(struct BlockBasedTableOptions,
|
|
|
|
cache_index_and_filter_blocks),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kNone}},
|
|
|
|
{"cache_index_and_filter_blocks_with_high_priority",
|
|
|
|
{offsetof(struct BlockBasedTableOptions,
|
|
|
|
cache_index_and_filter_blocks_with_high_priority),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kNone}},
|
|
|
|
{"pin_l0_filter_and_index_blocks_in_cache",
|
|
|
|
{offsetof(struct BlockBasedTableOptions,
|
|
|
|
pin_l0_filter_and_index_blocks_in_cache),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kNone}},
|
|
|
|
{"index_type", OptionTypeInfo::Enum<BlockBasedTableOptions::IndexType>(
|
|
|
|
offsetof(struct BlockBasedTableOptions, index_type),
|
|
|
|
&block_base_table_index_type_string_map)},
|
|
|
|
{"hash_index_allow_collision",
|
|
|
|
{0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
|
|
|
|
OptionTypeFlags::kNone}},
|
|
|
|
{"data_block_index_type",
|
|
|
|
OptionTypeInfo::Enum<BlockBasedTableOptions::DataBlockIndexType>(
|
|
|
|
offsetof(struct BlockBasedTableOptions, data_block_index_type),
|
|
|
|
&block_base_table_data_block_index_type_string_map)},
|
|
|
|
{"index_shortening",
|
|
|
|
OptionTypeInfo::Enum<BlockBasedTableOptions::IndexShorteningMode>(
|
|
|
|
offsetof(struct BlockBasedTableOptions, index_shortening),
|
|
|
|
&block_base_table_index_shortening_mode_string_map)},
|
|
|
|
{"data_block_hash_table_util_ratio",
|
|
|
|
{offsetof(struct BlockBasedTableOptions,
|
|
|
|
data_block_hash_table_util_ratio),
|
|
|
|
OptionType::kDouble, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kNone}},
|
|
|
|
{"checksum",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, checksum),
|
|
|
|
OptionType::kChecksumType, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kNone}},
|
|
|
|
{"no_block_cache",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, no_block_cache),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kNone}},
|
|
|
|
{"block_size",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, block_size),
|
|
|
|
OptionType::kSizeT, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kMutable}},
|
|
|
|
{"block_size_deviation",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, block_size_deviation),
|
|
|
|
OptionType::kInt, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kNone}},
|
|
|
|
{"block_restart_interval",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, block_restart_interval),
|
|
|
|
OptionType::kInt, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kMutable}},
|
|
|
|
{"index_block_restart_interval",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, index_block_restart_interval),
|
|
|
|
OptionType::kInt, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kNone}},
|
|
|
|
{"index_per_partition",
|
|
|
|
{0, OptionType::kUInt64T, OptionVerificationType::kDeprecated,
|
|
|
|
OptionTypeFlags::kNone}},
|
|
|
|
{"metadata_block_size",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, metadata_block_size),
|
|
|
|
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kNone}},
|
|
|
|
{"partition_filters",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, partition_filters),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kNone}},
|
Minimize memory internal fragmentation for Bloom filters (#6427)
Summary:
New experimental option BBTO::optimize_filters_for_memory builds
filters that maximize their use of "usable size" from malloc_usable_size,
which is also used to compute block cache charges.
Rather than always "rounding up," we track state in the
BloomFilterPolicy object to mix essentially "rounding down" and
"rounding up" so that the average FP rate of all generated filters is
the same as without the option. (YMMV as heavily accessed filters might
be unluckily lower accuracy.)
Thus, the option near-minimizes what the block cache considers as
"memory used" for a given target Bloom filter false positive rate and
Bloom filter implementation. There are no forward or backward
compatibility issues with this change, though it only works on the
format_version=5 Bloom filter.
With Jemalloc, we see about 10% reduction in memory footprint (and block
cache charge) for Bloom filters, but 1-2% increase in storage footprint,
due to encoding efficiency losses (FP rate is non-linear with bits/key).
Why not weighted random round up/down rather than state tracking? By
only requiring malloc_usable_size, we don't actually know what the next
larger and next smaller usable sizes for the allocator are. We pick a
requested size, accept and use whatever usable size it has, and use the
difference to inform our next choice. This allows us to narrow in on the
right balance without tracking/predicting usable sizes.
Why not weight history of generated filter false positive rates by
number of keys? This could lead to excess skew in small filters after
generating a large filter.
Results from filter_bench with jemalloc (irrelevant details omitted):
(normal keys/filter, but high variance)
$ ./filter_bench -quick -impl=2 -average_keys_per_filter=30000 -vary_key_count_ratio=0.9
Build avg ns/key: 29.6278
Number of filters: 5516
Total size (MB): 200.046
Reported total allocated memory (MB): 220.597
Reported internal fragmentation: 10.2732%
Bits/key stored: 10.0097
Average FP rate %: 0.965228
$ ./filter_bench -quick -impl=2 -average_keys_per_filter=30000 -vary_key_count_ratio=0.9 -optimize_filters_for_memory
Build avg ns/key: 30.5104
Number of filters: 5464
Total size (MB): 200.015
Reported total allocated memory (MB): 200.322
Reported internal fragmentation: 0.153709%
Bits/key stored: 10.1011
Average FP rate %: 0.966313
(very few keys / filter, optimization not as effective due to ~59 byte
internal fragmentation in blocked Bloom filter representation)
$ ./filter_bench -quick -impl=2 -average_keys_per_filter=1000 -vary_key_count_ratio=0.9
Build avg ns/key: 29.5649
Number of filters: 162950
Total size (MB): 200.001
Reported total allocated memory (MB): 224.624
Reported internal fragmentation: 12.3117%
Bits/key stored: 10.2951
Average FP rate %: 0.821534
$ ./filter_bench -quick -impl=2 -average_keys_per_filter=1000 -vary_key_count_ratio=0.9 -optimize_filters_for_memory
Build avg ns/key: 31.8057
Number of filters: 159849
Total size (MB): 200
Reported total allocated memory (MB): 208.846
Reported internal fragmentation: 4.42297%
Bits/key stored: 10.4948
Average FP rate %: 0.811006
(high keys/filter)
$ ./filter_bench -quick -impl=2 -average_keys_per_filter=1000000 -vary_key_count_ratio=0.9
Build avg ns/key: 29.7017
Number of filters: 164
Total size (MB): 200.352
Reported total allocated memory (MB): 221.5
Reported internal fragmentation: 10.5552%
Bits/key stored: 10.0003
Average FP rate %: 0.969358
$ ./filter_bench -quick -impl=2 -average_keys_per_filter=1000000 -vary_key_count_ratio=0.9 -optimize_filters_for_memory
Build avg ns/key: 30.7131
Number of filters: 160
Total size (MB): 200.928
Reported total allocated memory (MB): 200.938
Reported internal fragmentation: 0.00448054%
Bits/key stored: 10.1852
Average FP rate %: 0.963387
And from db_bench (block cache) with jemalloc:
$ ./db_bench -db=/dev/shm/dbbench.no_optimize -benchmarks=fillrandom -format_version=5 -value_size=90 -bloom_bits=10 -num=2000000 -threads=8 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=false
$ ./db_bench -db=/dev/shm/dbbench -benchmarks=fillrandom -format_version=5 -value_size=90 -bloom_bits=10 -num=2000000 -threads=8 -optimize_filters_for_memory -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=false
$ (for FILE in /dev/shm/dbbench.no_optimize/*.sst; do ./sst_dump --file=$FILE --show_properties | grep 'filter block' ; done) | awk '{ t += $4; } END { print t; }'
17063835
$ (for FILE in /dev/shm/dbbench/*.sst; do ./sst_dump --file=$FILE --show_properties | grep 'filter block' ; done) | awk '{ t += $4; } END { print t; }'
17430747
$ #^ 2.1% additional filter storage
$ ./db_bench -db=/dev/shm/dbbench.no_optimize -use_existing_db -benchmarks=readrandom,stats -statistics -bloom_bits=10 -num=2000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=false -duration=10 -cache_index_and_filter_blocks -cache_size=1000000000
rocksdb.block.cache.index.add COUNT : 33
rocksdb.block.cache.index.bytes.insert COUNT : 8440400
rocksdb.block.cache.filter.add COUNT : 33
rocksdb.block.cache.filter.bytes.insert COUNT : 21087528
rocksdb.bloom.filter.useful COUNT : 4963889
rocksdb.bloom.filter.full.positive COUNT : 1214081
rocksdb.bloom.filter.full.true.positive COUNT : 1161999
$ #^ 1.04 % observed FP rate
$ ./db_bench -db=/dev/shm/dbbench -use_existing_db -benchmarks=readrandom,stats -statistics -bloom_bits=10 -num=2000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=false -optimize_filters_for_memory -duration=10 -cache_index_and_filter_blocks -cache_size=1000000000
rocksdb.block.cache.index.add COUNT : 33
rocksdb.block.cache.index.bytes.insert COUNT : 8448592
rocksdb.block.cache.filter.add COUNT : 33
rocksdb.block.cache.filter.bytes.insert COUNT : 18220328
rocksdb.bloom.filter.useful COUNT : 5360933
rocksdb.bloom.filter.full.positive COUNT : 1321315
rocksdb.bloom.filter.full.true.positive COUNT : 1262999
$ #^ 1.08 % observed FP rate, 13.6% less memory usage for filters
(Due to specific key density, this example tends to generate filters that are "worse than average" for internal fragmentation. "Better than average" cases can show little or no improvement.)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6427
Test Plan: unit test added, 'make check' with gcc, clang and valgrind
Reviewed By: siying
Differential Revision: D22124374
Pulled By: pdillinger
fbshipit-source-id: f3e3aa152f9043ddf4fae25799e76341d0d8714e
5 years ago
|
|
|
{"optimize_filters_for_memory",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, optimize_filters_for_memory),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kNone}},
|
|
|
|
{"filter_policy",
|
|
|
|
OptionTypeInfo::AsCustomSharedPtr<const FilterPolicy>(
|
|
|
|
offsetof(struct BlockBasedTableOptions, filter_policy),
|
|
|
|
OptionVerificationType::kByNameAllowFromNull,
|
|
|
|
OptionTypeFlags::kNone)},
|
|
|
|
{"whole_key_filtering",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, whole_key_filtering),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kNone}},
|
Detect (new) Bloom/Ribbon Filter construction corruption (#9342)
Summary:
Note: rebase on and merge after https://github.com/facebook/rocksdb/pull/9349, https://github.com/facebook/rocksdb/pull/9345, (optional) https://github.com/facebook/rocksdb/pull/9393
**Context:**
(Quoted from pdillinger) Layers of information during new Bloom/Ribbon Filter construction in building block-based tables includes the following:
a) set of keys to add to filter
b) set of hashes to add to filter (64-bit hash applied to each key)
c) set of Bloom indices to set in filter, with duplicates
d) set of Bloom indices to set in filter, deduplicated
e) final filter and its checksum
This PR aims to detect corruption (e.g., unexpected hardware/software corruption of data structures residing in memory for a long time) from b) to e), and leaves a) as future work at the application level.
- b)'s corruption is detected by verifying the xor checksum of the hash entries calculated as the entries accumulate before being added to the filter. (i.e, `XXPH3FilterBitsBuilder::MaybeVerifyHashEntriesChecksum()`)
- c) - e)'s corruption is detected by verifying the hash entries indeed exists in the constructed filter by re-querying these hash entries in the filter (i.e, `FilterBitsBuilder::MaybePostVerify()`) after computing the block checksum (except for PartitionFilter, which is done right after each `FilterBitsBuilder::Finish` for impl simplicity - see code comment for more). For this stage of detection, we assume hash entries are not corrupted after checking on b) since the time interval from b) to c) is relatively short IMO.
Option to enable this feature of detection is `BlockBasedTableOptions::detect_filter_construct_corruption` which is false by default.
**Summary:**
- Implemented new functions `XXPH3FilterBitsBuilder::MaybeVerifyHashEntriesChecksum()` and `FilterBitsBuilder::MaybePostVerify()`
- Ensured hash entries, final filter and banding and their [cache reservation ](https://github.com/facebook/rocksdb/issues/9073) are released properly despite corruption
- See [Filter.construction.artifacts.release.point.pdf ](https://github.com/facebook/rocksdb/files/7923487/Design.Filter.construction.artifacts.release.point.pdf) for high-level design
- Bundled and refactored the hash-entry-related artifacts in XXPH3FilterBitsBuilder into `HashEntriesInfo` for better control over the lifetime of these artifacts during `SwapEntires`, `ResetEntries`
- Ensured RocksDB block-based table builder calls `FilterBitsBuilder::MaybePostVerify()` after constructing the filter by `FilterBitsBuilder::Finish()`
- When encountering such filter construction corruption, stop writing the filter content to files and mark such a block-based table building non-ok by storing the corruption status in the builder.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9342
Test Plan:
- Added new unit test `DBFilterConstructionCorruptionTestWithParam.DetectCorruption`
- Included this new feature in `DBFilterConstructionReserveMemoryTestWithParam.ReserveMemory` as this feature heavily touch ReserveMemory's impl
- For fallback case, I run `./filter_bench -impl=3 -detect_filter_construct_corruption=true -reserve_table_builder_memory=true -strict_capacity_limit=true -quick -runs 10 | grep 'Build avg'` to make sure nothing break.
- Added to `filter_bench`: increased filter construction time by **30%**, mostly by `MaybePostVerify()`
- FastLocalBloom
- Before change: `./filter_bench -impl=2 -quick -runs 10 | grep 'Build avg'`: **28.86643s**
- After change:
- `./filter_bench -impl=2 -detect_filter_construct_corruption=false -quick -runs 10 | grep 'Build avg'` (expect a tiny increase due to MaybePostVerify is always called regardless): **27.6644s (-4% perf improvement might be due to now we don't drop bloom hash entry in `AddAllEntries` along iteration but in bulk later, same with the bypassing-MaybePostVerify case below)**
- `./filter_bench -impl=2 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'` (expect acceptable increase): **34.41159s (+20%)**
- `./filter_bench -impl=2 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'` (by-passing MaybePostVerify, expect minor increase): **27.13431s (-6%)**
- Standard128Ribbon
- Before change: `./filter_bench -impl=3 -quick -runs 10 | grep 'Build avg'`: **122.5384s**
- After change:
- `./filter_bench -impl=3 -detect_filter_construct_corruption=false -quick -runs 10 | grep 'Build avg'` (expect a tiny increase due to MaybePostVerify is always called regardless - verified by removing MaybePostVerify under this case and found only +-1ns difference): **124.3588s (+2%)**
- `./filter_bench -impl=3 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'`(expect acceptable increase): **159.4946s (+30%)**
- `./filter_bench -impl=3 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'`(by-passing MaybePostVerify, expect minor increase) : **125.258s (+2%)**
- Added to `db_stress`: `make crash_test`, `./db_stress --detect_filter_construct_corruption=true`
- Manually smoke-tested: manually corrupted the filter construction in some db level tests with basic PUT and background flush. As expected, the error did get returned to users in subsequent PUT and Flush status.
Reviewed By: pdillinger
Differential Revision: D33746928
Pulled By: hx235
fbshipit-source-id: cb056426be5a7debc1cd16f23bc250f36a08ca57
3 years ago
|
|
|
{"detect_filter_construct_corruption",
|
|
|
|
{offsetof(struct BlockBasedTableOptions,
|
|
|
|
detect_filter_construct_corruption),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kMutable}},
|
|
|
|
{"reserve_table_builder_memory",
|
|
|
|
{0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
|
|
|
|
OptionTypeFlags::kNone}},
|
|
|
|
{"reserve_table_reader_memory",
|
|
|
|
{0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
|
|
|
|
OptionTypeFlags::kNone}},
|
|
|
|
{"skip_table_builder_flush",
|
|
|
|
{0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
|
|
|
|
OptionTypeFlags::kNone}},
|
|
|
|
{"format_version",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, format_version),
|
|
|
|
OptionType::kUInt32T, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kNone}},
|
|
|
|
{"verify_compression",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, verify_compression),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kNone}},
|
|
|
|
{"read_amp_bytes_per_bit",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, read_amp_bytes_per_bit),
|
|
|
|
OptionType::kUInt32T, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kNone,
|
|
|
|
[](const ConfigOptions& /*opts*/, const std::string& /*name*/,
|
|
|
|
const std::string& value, void* addr) {
|
|
|
|
// A workaround to fix a bug in 6.10, 6.11, 6.12, 6.13
|
|
|
|
// and 6.14. The bug will write out 8 bytes to OPTIONS file from the
|
|
|
|
// starting address of BlockBasedTableOptions.read_amp_bytes_per_bit
|
|
|
|
// which is actually a uint32. Consequently, the value of
|
|
|
|
// read_amp_bytes_per_bit written in the OPTIONS file is wrong.
|
|
|
|
// From 6.15, RocksDB will try to parse the read_amp_bytes_per_bit
|
|
|
|
// from OPTIONS file as a uint32. To be able to load OPTIONS file
|
|
|
|
// generated by affected releases before the fix, we need to
|
|
|
|
// manually parse read_amp_bytes_per_bit with this special hack.
|
|
|
|
uint64_t read_amp_bytes_per_bit = ParseUint64(value);
|
|
|
|
*(static_cast<uint32_t*>(addr)) =
|
|
|
|
static_cast<uint32_t>(read_amp_bytes_per_bit);
|
|
|
|
return Status::OK();
|
|
|
|
}}},
|
|
|
|
{"enable_index_compression",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, enable_index_compression),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kNone}},
|
|
|
|
{"block_align",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, block_align),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kNone}},
|
|
|
|
{"pin_top_level_index_and_filter",
|
|
|
|
{offsetof(struct BlockBasedTableOptions,
|
|
|
|
pin_top_level_index_and_filter),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kNone}},
|
|
|
|
{kOptNameMetadataCacheOpts,
|
|
|
|
OptionTypeInfo::Struct(
|
|
|
|
kOptNameMetadataCacheOpts, &metadata_cache_options_type_info,
|
|
|
|
offsetof(struct BlockBasedTableOptions, metadata_cache_options),
|
|
|
|
OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
|
|
|
|
{"block_cache",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, block_cache),
|
|
|
|
OptionType::kUnknown, OptionVerificationType::kNormal,
|
|
|
|
(OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize),
|
|
|
|
// Parses the input value as a Cache
|
|
|
|
[](const ConfigOptions& opts, const std::string&,
|
|
|
|
const std::string& value, void* addr) {
|
|
|
|
auto* cache = static_cast<std::shared_ptr<Cache>*>(addr);
|
|
|
|
return Cache::CreateFromString(opts, value, cache);
|
|
|
|
}}},
|
|
|
|
{"block_cache_compressed",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, block_cache_compressed),
|
|
|
|
OptionType::kUnknown, OptionVerificationType::kNormal,
|
|
|
|
(OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize),
|
|
|
|
// Parses the input value as a Cache
|
|
|
|
[](const ConfigOptions& opts, const std::string&,
|
|
|
|
const std::string& value, void* addr) {
|
|
|
|
auto* cache = static_cast<std::shared_ptr<Cache>*>(addr);
|
|
|
|
return Cache::CreateFromString(opts, value, cache);
|
|
|
|
}}},
|
|
|
|
{"max_auto_readahead_size",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, max_auto_readahead_size),
|
|
|
|
OptionType::kSizeT, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kMutable}},
|
|
|
|
{"prepopulate_block_cache",
|
|
|
|
OptionTypeInfo::Enum<BlockBasedTableOptions::PrepopulateBlockCache>(
|
|
|
|
offsetof(struct BlockBasedTableOptions, prepopulate_block_cache),
|
|
|
|
&block_base_table_prepopulate_block_cache_string_map,
|
|
|
|
OptionTypeFlags::kMutable)},
|
|
|
|
{"initial_auto_readahead_size",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, initial_auto_readahead_size),
|
|
|
|
OptionType::kSizeT, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kMutable}},
|
|
|
|
{"num_file_reads_for_auto_readahead",
|
|
|
|
{offsetof(struct BlockBasedTableOptions,
|
|
|
|
num_file_reads_for_auto_readahead),
|
|
|
|
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kMutable}},
|
|
|
|
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
};
|
|
|
|
|
|
|
|
// TODO(myabandeh): We should return an error instead of silently changing the
|
|
|
|
// options
|
|
|
|
BlockBasedTableFactory::BlockBasedTableFactory(
|
|
|
|
const BlockBasedTableOptions& _table_options)
|
|
|
|
: table_options_(_table_options) {
|
|
|
|
InitializeOptions();
|
|
|
|
RegisterOptions(&table_options_, &block_based_table_type_info);
|
|
|
|
|
|
|
|
const auto table_reader_charged =
|
|
|
|
table_options_.cache_usage_options.options_overrides
|
|
|
|
.at(CacheEntryRole::kBlockBasedTableReader)
|
|
|
|
.charged;
|
|
|
|
if (table_options_.block_cache &&
|
|
|
|
table_reader_charged == CacheEntryRoleOptions::Decision::kEnabled) {
|
|
|
|
table_reader_cache_res_mgr_.reset(new ConcurrentCacheReservationManager(
|
|
|
|
std::make_shared<CacheReservationManagerImpl<
|
|
|
|
CacheEntryRole::kBlockBasedTableReader>>(
|
|
|
|
table_options_.block_cache)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void BlockBasedTableFactory::InitializeOptions() {
|
|
|
|
if (table_options_.flush_block_policy_factory == nullptr) {
|
|
|
|
table_options_.flush_block_policy_factory.reset(
|
|
|
|
new FlushBlockBySizePolicyFactory());
|
|
|
|
}
|
|
|
|
if (table_options_.no_block_cache) {
|
|
|
|
table_options_.block_cache.reset();
|
|
|
|
} else if (table_options_.block_cache == nullptr) {
|
|
|
|
LRUCacheOptions co;
|
|
|
|
co.capacity = 8 << 20;
|
|
|
|
// It makes little sense to pay overhead for mid-point insertion while the
|
|
|
|
// block size is only 8MB.
|
|
|
|
co.high_pri_pool_ratio = 0.0;
|
|
|
|
co.low_pri_pool_ratio = 0.0;
|
|
|
|
table_options_.block_cache = NewLRUCache(co);
|
|
|
|
}
|
|
|
|
if (table_options_.block_size_deviation < 0 ||
|
|
|
|
table_options_.block_size_deviation > 100) {
|
|
|
|
table_options_.block_size_deviation = 0;
|
|
|
|
}
|
|
|
|
if (table_options_.block_restart_interval < 1) {
|
|
|
|
table_options_.block_restart_interval = 1;
|
|
|
|
}
|
|
|
|
if (table_options_.index_block_restart_interval < 1) {
|
|
|
|
table_options_.index_block_restart_interval = 1;
|
|
|
|
}
|
|
|
|
if (table_options_.index_type == BlockBasedTableOptions::kHashSearch &&
|
|
|
|
table_options_.index_block_restart_interval != 1) {
|
|
|
|
// Currently kHashSearch is incompatible with index_block_restart_interval > 1
|
|
|
|
table_options_.index_block_restart_interval = 1;
|
|
|
|
}
|
|
|
|
if (table_options_.partition_filters &&
|
|
|
|
table_options_.index_type !=
|
|
|
|
BlockBasedTableOptions::kTwoLevelIndexSearch) {
|
|
|
|
// We do not support partitioned filters without partitioning indexes
|
|
|
|
table_options_.partition_filters = false;
|
|
|
|
}
|
|
|
|
auto& options_overrides =
|
|
|
|
table_options_.cache_usage_options.options_overrides;
|
|
|
|
const auto options = table_options_.cache_usage_options.options;
|
|
|
|
for (std::uint32_t i = 0; i < kNumCacheEntryRoles; ++i) {
|
|
|
|
CacheEntryRole role = static_cast<CacheEntryRole>(i);
|
|
|
|
auto options_overrides_iter = options_overrides.find(role);
|
|
|
|
if (options_overrides_iter == options_overrides.end()) {
|
|
|
|
options_overrides.insert({role, options});
|
|
|
|
} else if (options_overrides_iter->second.charged ==
|
|
|
|
CacheEntryRoleOptions::Decision::kFallback) {
|
|
|
|
options_overrides_iter->second.charged = options.charged;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Re-normalizes table_options_ via InitializeOptions() and then delegates to
// TableFactory::PrepareOptions(), whose status is returned unchanged.
Status BlockBasedTableFactory::PrepareOptions(const ConfigOptions& opts) {
  InitializeOptions();
  Status base_status = TableFactory::PrepareOptions(opts);
  return base_status;
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
// Different cache kinds use the same keys for physically different values, so
|
|
|
|
// they must not share an underlying key space with each other.
|
|
|
|
Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) {
|
|
|
|
int cache_count = (bbto.block_cache != nullptr) +
|
|
|
|
(bbto.block_cache_compressed != nullptr) +
|
|
|
|
(bbto.persistent_cache != nullptr);
|
|
|
|
if (cache_count <= 1) {
|
|
|
|
// Nothing to share / overlap
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Simple pointer equality
|
|
|
|
if (bbto.block_cache == bbto.block_cache_compressed) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"block_cache same as block_cache_compressed not currently supported, "
|
|
|
|
"and would be bad for performance anyway");
|
|
|
|
}
|
|
|
|
|
|
|
|
// More complex test of shared key space, in case the instances are wrappers
|
|
|
|
// for some shared underlying cache.
|
|
|
|
CacheKey sentinel_key = CacheKey::CreateUniqueForProcessLifetime();
|
|
|
|
static char kRegularBlockCacheMarker = 'b';
|
|
|
|
static char kCompressedBlockCacheMarker = 'c';
|
|
|
|
static char kPersistentCacheMarker = 'p';
|
|
|
|
if (bbto.block_cache) {
|
|
|
|
bbto.block_cache
|
|
|
|
->Insert(sentinel_key.AsSlice(), &kRegularBlockCacheMarker, 1,
|
|
|
|
GetNoopDeleterForRole<CacheEntryRole::kMisc>())
|
|
|
|
.PermitUncheckedError();
|
|
|
|
}
|
|
|
|
if (bbto.block_cache_compressed) {
|
|
|
|
bbto.block_cache_compressed
|
|
|
|
->Insert(sentinel_key.AsSlice(), &kCompressedBlockCacheMarker, 1,
|
|
|
|
GetNoopDeleterForRole<CacheEntryRole::kMisc>())
|
|
|
|
.PermitUncheckedError();
|
|
|
|
}
|
|
|
|
if (bbto.persistent_cache) {
|
|
|
|
// Note: persistent cache copies the data, not keeping the pointer
|
|
|
|
bbto.persistent_cache
|
|
|
|
->Insert(sentinel_key.AsSlice(), &kPersistentCacheMarker, 1)
|
|
|
|
.PermitUncheckedError();
|
|
|
|
}
|
|
|
|
// If we get something different from what we inserted, that indicates
|
|
|
|
// dangerously overlapping key spaces.
|
|
|
|
if (bbto.block_cache) {
|
|
|
|
auto handle = bbto.block_cache->Lookup(sentinel_key.AsSlice());
|
|
|
|
if (handle) {
|
|
|
|
auto v = static_cast<char*>(bbto.block_cache->Value(handle));
|
|
|
|
char c = *v;
|
|
|
|
bbto.block_cache->Release(handle);
|
|
|
|
if (v == &kCompressedBlockCacheMarker) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"block_cache and block_cache_compressed share the same key space, "
|
|
|
|
"which is not supported");
|
|
|
|
} else if (c == kPersistentCacheMarker) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"block_cache and persistent_cache share the same key space, "
|
|
|
|
"which is not supported");
|
|
|
|
} else if (v != &kRegularBlockCacheMarker) {
|
|
|
|
return Status::Corruption("Unexpected mutation to block_cache");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (bbto.block_cache_compressed) {
|
|
|
|
auto handle = bbto.block_cache_compressed->Lookup(sentinel_key.AsSlice());
|
|
|
|
if (handle) {
|
|
|
|
auto v = static_cast<char*>(bbto.block_cache_compressed->Value(handle));
|
|
|
|
char c = *v;
|
|
|
|
bbto.block_cache_compressed->Release(handle);
|
|
|
|
if (v == &kRegularBlockCacheMarker) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"block_cache_compressed and block_cache share the same key space, "
|
|
|
|
"which is not supported");
|
|
|
|
} else if (c == kPersistentCacheMarker) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"block_cache_compressed and persistent_cache share the same key "
|
|
|
|
"space, "
|
|
|
|
"which is not supported");
|
|
|
|
} else if (v != &kCompressedBlockCacheMarker) {
|
|
|
|
return Status::Corruption(
|
|
|
|
"Unexpected mutation to block_cache_compressed");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (bbto.persistent_cache) {
|
|
|
|
std::unique_ptr<char[]> data;
|
|
|
|
size_t size = 0;
|
|
|
|
bbto.persistent_cache->Lookup(sentinel_key.AsSlice(), &data, &size)
|
|
|
|
.PermitUncheckedError();
|
|
|
|
if (data && size > 0) {
|
|
|
|
if (data[0] == kRegularBlockCacheMarker) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"persistent_cache and block_cache share the same key space, "
|
|
|
|
"which is not supported");
|
|
|
|
} else if (data[0] == kCompressedBlockCacheMarker) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"persistent_cache and block_cache_compressed share the same key "
|
|
|
|
"space, "
|
|
|
|
"which is not supported");
|
|
|
|
} else if (data[0] != kPersistentCacheMarker) {
|
|
|
|
return Status::Corruption("Unexpected mutation to persistent_cache");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
// Opens a block-based table reader over `file` by forwarding this factory's
// table options, its table reader cache reservation manager and tail prefetch
// stats, together with the caller-supplied reader options, to
// BlockBasedTable::Open(). The reader is returned through `table_reader` and
// the status of Open() is returned unchanged.
Status BlockBasedTableFactory::NewTableReader(
    const ReadOptions& ro, const TableReaderOptions& table_reader_options,
    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
    std::unique_ptr<TableReader>* table_reader,
    bool prefetch_index_and_filter_in_cache) const {
  // Short alias; the argument order below must match BlockBasedTable::Open().
  const TableReaderOptions& tro = table_reader_options;
  return BlockBasedTable::Open(
      ro, tro.ioptions, tro.env_options, table_options_,
      tro.internal_comparator, std::move(file), file_size, table_reader,
      table_reader_cache_res_mgr_, tro.prefix_extractor,
      prefetch_index_and_filter_in_cache, tro.skip_filters, tro.level,
      tro.immortal, tro.largest_seqno, tro.force_direct_prefetch,
      &tail_prefetch_stats_, tro.block_cache_tracer,
      tro.max_file_size_for_l0_meta_pin, tro.cur_db_session_id,
      tro.cur_file_num, tro.unique_id);
}
|
|
|
|
|
|
|
|
// Allocates a BlockBasedTableBuilder that writes to `file`, configured with
// this factory's table options and the given builder options. The builder is
// created with `new` and returned as a raw pointer.
TableBuilder* BlockBasedTableFactory::NewTableBuilder(
    const TableBuilderOptions& table_builder_options,
    WritableFileWriter* file) const {
  TableBuilder* builder =
      new BlockBasedTableBuilder(table_options_, table_builder_options, file);
  return builder;
}
|
|
|
|
|
|
|
|
Status BlockBasedTableFactory::ValidateOptions(
|
|
|
|
const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const {
|
|
|
|
if (table_options_.index_type == BlockBasedTableOptions::kHashSearch &&
|
|
|
|
cf_opts.prefix_extractor == nullptr) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Hash index is specified for block-based "
|
|
|
|
"table, but prefix_extractor is not given");
|
|
|
|
}
|
|
|
|
if (table_options_.cache_index_and_filter_blocks &&
|
|
|
|
table_options_.no_block_cache) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Enable cache_index_and_filter_blocks, "
|
|
|
|
", but block cache is disabled");
|
|
|
|
}
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
9 years ago
|
|
|
if (table_options_.pin_l0_filter_and_index_blocks_in_cache &&
|
|
|
|
table_options_.no_block_cache) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Enable pin_l0_filter_and_index_blocks_in_cache, "
|
|
|
|
", but block cache is disabled");
|
|
|
|
}
|
|
|
|
if (!IsSupportedFormatVersion(table_options_.format_version)) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Unsupported BlockBasedTable format_version. Please check "
|
|
|
|
"include/rocksdb/table.h for more info");
|
|
|
|
}
|
|
|
|
if (table_options_.block_align && (cf_opts.compression != kNoCompression)) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Enable block_align, but compression "
|
|
|
|
"enabled");
|
|
|
|
}
|
|
|
|
if (table_options_.block_align &&
|
|
|
|
(table_options_.block_size & (table_options_.block_size - 1))) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Block alignment requested but block size is not a power of 2");
|
|
|
|
}
|
|
|
|
if (table_options_.block_size > std::numeric_limits<uint32_t>::max()) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"block size exceeds maximum number (4GiB) allowed");
|
|
|
|
}
|
|
|
|
if (table_options_.data_block_index_type ==
|
|
|
|
BlockBasedTableOptions::kDataBlockBinaryAndHash &&
|
|
|
|
table_options_.data_block_hash_table_util_ratio <= 0) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"data_block_hash_table_util_ratio should be greater than 0 when "
|
|
|
|
"data_block_index_type is set to kDataBlockBinaryAndHash");
|
|
|
|
}
|
|
|
|
if (db_opts.unordered_write && cf_opts.max_successive_merges > 0) {
|
|
|
|
// TODO(myabandeh): support it
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"max_successive_merges larger than 0 is currently inconsistent with "
|
|
|
|
"unordered_write");
|
|
|
|
}
|
|
|
|
const auto& options_overrides =
|
|
|
|
table_options_.cache_usage_options.options_overrides;
|
|
|
|
for (auto options_overrides_iter = options_overrides.cbegin();
|
|
|
|
options_overrides_iter != options_overrides.cend();
|
|
|
|
++options_overrides_iter) {
|
|
|
|
const CacheEntryRole role = options_overrides_iter->first;
|
|
|
|
const CacheEntryRoleOptions options = options_overrides_iter->second;
|
|
|
|
static const std::set<CacheEntryRole> kMemoryChargingSupported = {
|
|
|
|
CacheEntryRole::kCompressionDictionaryBuildingBuffer,
|
|
|
|
CacheEntryRole::kFilterConstruction,
|
|
|
|
CacheEntryRole::kBlockBasedTableReader, CacheEntryRole::kFileMetadata,
|
|
|
|
CacheEntryRole::kBlobCache};
|
|
|
|
if (options.charged != CacheEntryRoleOptions::Decision::kFallback &&
|
|
|
|
kMemoryChargingSupported.count(role) == 0) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"Enable/Disable CacheEntryRoleOptions::charged"
|
|
|
|
" for CacheEntryRole " +
|
|
|
|
kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] +
|
|
|
|
" is not supported");
|
|
|
|
}
|
|
|
|
if (table_options_.no_block_cache &&
|
|
|
|
options.charged == CacheEntryRoleOptions::Decision::kEnabled) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Enable CacheEntryRoleOptions::charged"
|
|
|
|
" for CacheEntryRole " +
|
|
|
|
kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] +
|
|
|
|
" but block cache is disabled");
|
|
|
|
}
|
|
|
|
if (role == CacheEntryRole::kBlobCache &&
|
|
|
|
options.charged == CacheEntryRoleOptions::Decision::kEnabled) {
|
|
|
|
if (cf_opts.blob_cache == nullptr) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Enable CacheEntryRoleOptions::charged"
|
|
|
|
" for CacheEntryRole " +
|
|
|
|
kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] +
|
|
|
|
" but blob cache is not configured");
|
|
|
|
}
|
|
|
|
if (table_options_.no_block_cache) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Enable CacheEntryRoleOptions::charged"
|
|
|
|
" for CacheEntryRole " +
|
|
|
|
kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] +
|
|
|
|
" but block cache is disabled");
|
|
|
|
}
|
|
|
|
if (table_options_.block_cache == cf_opts.blob_cache) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Enable CacheEntryRoleOptions::charged"
|
|
|
|
" for CacheEntryRole " +
|
|
|
|
kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] +
|
|
|
|
" but blob cache is the same as block cache");
|
|
|
|
}
|
|
|
|
if (cf_opts.blob_cache->GetCapacity() >
|
|
|
|
table_options_.block_cache->GetCapacity()) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Enable CacheEntryRoleOptions::charged"
|
|
|
|
" for CacheEntryRole " +
|
|
|
|
kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] +
|
|
|
|
" but blob cache capacity is larger than block cache capacity");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
{
|
|
|
|
Status s = CheckCacheOptionCompatibility(table_options_);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
3 years ago
|
|
|
std::string garbage;
|
|
|
|
if (!SerializeEnum<ChecksumType>(checksum_type_string_map,
|
|
|
|
table_options_.checksum, &garbage)) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Unrecognized ChecksumType for checksum: " +
|
|
|
|
std::to_string(static_cast<uint32_t>(table_options_.checksum)));
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
3 years ago
|
|
|
}
|
|
|
|
return TableFactory::ValidateOptions(db_opts, cf_opts);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string BlockBasedTableFactory::GetPrintableOptions() const {
|
|
|
|
std::string ret;
|
|
|
|
ret.reserve(20000);
|
|
|
|
const int kBufferSize = 200;
|
|
|
|
char buffer[kBufferSize];
|
|
|
|
|
|
|
|
snprintf(buffer, kBufferSize, " flush_block_policy_factory: %s (%p)\n",
|
|
|
|
table_options_.flush_block_policy_factory->Name(),
|
|
|
|
static_cast<void*>(table_options_.flush_block_policy_factory.get()));
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " cache_index_and_filter_blocks: %d\n",
|
|
|
|
table_options_.cache_index_and_filter_blocks);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize,
|
|
|
|
" cache_index_and_filter_blocks_with_high_priority: %d\n",
|
|
|
|
table_options_.cache_index_and_filter_blocks_with_high_priority);
|
|
|
|
ret.append(buffer);
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
9 years ago
|
|
|
snprintf(buffer, kBufferSize,
|
|
|
|
" pin_l0_filter_and_index_blocks_in_cache: %d\n",
|
|
|
|
table_options_.pin_l0_filter_and_index_blocks_in_cache);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " pin_top_level_index_and_filter: %d\n",
|
|
|
|
table_options_.pin_top_level_index_and_filter);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " index_type: %d\n",
|
|
|
|
table_options_.index_type);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " data_block_index_type: %d\n",
|
|
|
|
table_options_.data_block_index_type);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " index_shortening: %d\n",
|
|
|
|
static_cast<int>(table_options_.index_shortening));
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " data_block_hash_table_util_ratio: %lf\n",
|
|
|
|
table_options_.data_block_hash_table_util_ratio);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " checksum: %d\n", table_options_.checksum);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " no_block_cache: %d\n",
|
|
|
|
table_options_.no_block_cache);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " block_cache: %p\n",
|
|
|
|
static_cast<void*>(table_options_.block_cache.get()));
|
|
|
|
ret.append(buffer);
|
|
|
|
if (table_options_.block_cache) {
|
|
|
|
const char* block_cache_name = table_options_.block_cache->Name();
|
|
|
|
if (block_cache_name != nullptr) {
|
|
|
|
snprintf(buffer, kBufferSize, " block_cache_name: %s\n",
|
|
|
|
block_cache_name);
|
|
|
|
ret.append(buffer);
|
|
|
|
}
|
|
|
|
ret.append(" block_cache_options:\n");
|
|
|
|
ret.append(table_options_.block_cache->GetPrintableOptions());
|
|
|
|
}
|
|
|
|
snprintf(buffer, kBufferSize, " block_cache_compressed: %p\n",
|
|
|
|
static_cast<void*>(table_options_.block_cache_compressed.get()));
|
|
|
|
ret.append(buffer);
|
|
|
|
if (table_options_.block_cache_compressed) {
|
|
|
|
const char* block_cache_compressed_name =
|
|
|
|
table_options_.block_cache_compressed->Name();
|
|
|
|
if (block_cache_compressed_name != nullptr) {
|
|
|
|
snprintf(buffer, kBufferSize, " block_cache_name: %s\n",
|
|
|
|
block_cache_compressed_name);
|
|
|
|
ret.append(buffer);
|
|
|
|
}
|
|
|
|
ret.append(" block_cache_compressed_options:\n");
|
|
|
|
ret.append(table_options_.block_cache_compressed->GetPrintableOptions());
|
|
|
|
}
|
|
|
|
snprintf(buffer, kBufferSize, " persistent_cache: %p\n",
|
|
|
|
static_cast<void*>(table_options_.persistent_cache.get()));
|
|
|
|
ret.append(buffer);
|
|
|
|
if (table_options_.persistent_cache) {
|
|
|
|
snprintf(buffer, kBufferSize, " persistent_cache_options:\n");
|
|
|
|
ret.append(buffer);
|
|
|
|
ret.append(table_options_.persistent_cache->GetPrintableOptions());
|
|
|
|
}
|
|
|
|
snprintf(buffer, kBufferSize, " block_size: %" PRIu64 "\n",
|
|
|
|
table_options_.block_size);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " block_size_deviation: %d\n",
|
|
|
|
table_options_.block_size_deviation);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " block_restart_interval: %d\n",
|
|
|
|
table_options_.block_restart_interval);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " index_block_restart_interval: %d\n",
|
|
|
|
table_options_.index_block_restart_interval);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " metadata_block_size: %" PRIu64 "\n",
|
|
|
|
table_options_.metadata_block_size);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " partition_filters: %d\n",
|
|
|
|
table_options_.partition_filters);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " use_delta_encoding: %d\n",
|
|
|
|
table_options_.use_delta_encoding);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " filter_policy: %s\n",
|
|
|
|
table_options_.filter_policy == nullptr
|
|
|
|
? "nullptr"
|
|
|
|
: table_options_.filter_policy->Name());
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " whole_key_filtering: %d\n",
|
|
|
|
table_options_.whole_key_filtering);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " verify_compression: %d\n",
|
|
|
|
table_options_.verify_compression);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " read_amp_bytes_per_bit: %d\n",
|
|
|
|
table_options_.read_amp_bytes_per_bit);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " format_version: %d\n",
|
|
|
|
table_options_.format_version);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " enable_index_compression: %d\n",
|
|
|
|
table_options_.enable_index_compression);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " block_align: %d\n",
|
|
|
|
table_options_.block_align);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize,
|
|
|
|
" max_auto_readahead_size: %" ROCKSDB_PRIszt "\n",
|
|
|
|
table_options_.max_auto_readahead_size);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " prepopulate_block_cache: %d\n",
|
|
|
|
static_cast<int>(table_options_.prepopulate_block_cache));
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize,
|
|
|
|
" initial_auto_readahead_size: %" ROCKSDB_PRIszt "\n",
|
|
|
|
table_options_.initial_auto_readahead_size);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize,
|
|
|
|
" num_file_reads_for_auto_readahead: %" PRIu64 "\n",
|
|
|
|
table_options_.num_file_reads_for_auto_readahead);
|
|
|
|
ret.append(buffer);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
const void* BlockBasedTableFactory::GetOptionsPtr(
|
|
|
|
const std::string& name) const {
|
|
|
|
if (name == kBlockCacheOpts()) {
|
|
|
|
if (table_options_.no_block_cache) {
|
|
|
|
return nullptr;
|
|
|
|
} else {
|
|
|
|
return table_options_.block_cache.get();
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
return TableFactory::GetOptionsPtr(name);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
// Take a default BlockBasedTableOptions "table_options" in addition to a
|
|
|
|
// map "opts_map" of option name to option value to construct the new
|
|
|
|
// BlockBasedTableOptions "new_table_options".
|
|
|
|
//
|
|
|
|
// Below are the instructions of how to config some non-primitive-typed
|
|
|
|
// options in BlockBasedTableOptions:
|
|
|
|
//
|
|
|
|
// * filter_policy:
|
|
|
|
// We currently only support the following FilterPolicy in the convenience
|
|
|
|
// functions:
|
|
|
|
// - BloomFilter: use "bloomfilter:[bits_per_key]:[use_block_based_builder]"
|
|
|
|
// to specify BloomFilter. The above string is equivalent to calling
|
|
|
|
// NewBloomFilterPolicy(bits_per_key, use_block_based_builder).
|
|
|
|
// [Example]:
|
|
|
|
// - Pass {"filter_policy", "bloomfilter:4:true"} in
|
|
|
|
// GetBlockBasedTableOptionsFromMap to use a BloomFilter with 4-bits
|
|
|
|
// per key and use_block_based_builder enabled.
|
|
|
|
//
|
|
|
|
// * block_cache / block_cache_compressed:
|
|
|
|
// We currently only support LRU cache in the GetOptions API. The LRU
|
|
|
|
// cache can be set by directly specifying its size.
|
|
|
|
// [Example]:
|
|
|
|
// - Passing {"block_cache", "1M"} in GetBlockBasedTableOptionsFromMap is
|
|
|
|
// equivalent to setting block_cache using NewLRUCache(1024 * 1024).
|
|
|
|
//
|
|
|
|
// @param table_options the default options of the output "new_table_options".
|
|
|
|
// @param opts_map an option name to value map for specifying how
|
|
|
|
// "new_table_options" should be set.
|
|
|
|
// @param new_table_options the resulting options based on "table_options"
|
|
|
|
// with the change specified in "opts_map".
|
|
|
|
// @param input_strings_escaped when set to true, each escaped character
|
|
|
|
// prefixed by '\' in the values of the opts_map will be further converted
|
|
|
|
// back to the raw string before assigning to the associated options.
|
|
|
|
// @param ignore_unknown_options when set to true, unknown options are ignored
|
|
|
|
// instead of resulting in an unknown-option error.
|
|
|
|
// @return Status::OK() on success. Otherwise, a non-ok status indicating
|
|
|
|
// error will be returned, and "new_table_options" will be set to
|
|
|
|
// "table_options".
|
|
|
|
// Parses a single option by delegating to TableFactory::ParseOption(), then
// relaxes the outcome in one case: if the base parse failed while
// `config_options.input_strings_escaped` is set and the option is a by-name
// option (`opt_info.IsByName()`), the failure is replaced by Status::OK().
// In every other case the status from the base class is returned unchanged.
Status BlockBasedTableFactory::ParseOption(const ConfigOptions& config_options,
                                           const OptionTypeInfo& opt_info,
                                           const std::string& opt_name,
                                           const std::string& opt_value,
                                           void* opt_ptr) {
  Status result = TableFactory::ParseOption(config_options, opt_info, opt_name,
                                            opt_value, opt_ptr);
  // NOTE(review): the previous comment here stated that
  // "!input_strings_escaped indicates the old API, where everything is
  // parsable", yet the check requires input_strings_escaped to be true.
  // Confirm which polarity is intended; behavior is left as it was.
  const bool forgive_failure = config_options.input_strings_escaped &&
                               !result.ok() && opt_info.IsByName();
  if (forgive_failure) {
    result = Status::OK();
  }
  return result;
}
|
|
|
|
|
|
|
|
// Convenience overload without a caller-supplied ConfigOptions: builds one
// with input_strings_escaped, ignore_unknown_options, invoke_prepare_options
// and ignore_unsupported_options all set to false, then forwards to the
// ConfigOptions-taking overload.
Status GetBlockBasedTableOptionsFromString(
    const BlockBasedTableOptions& table_options, const std::string& opts_str,
    BlockBasedTableOptions* new_table_options) {
  ConfigOptions fixed_config;
  fixed_config.ignore_unsupported_options = false;
  fixed_config.invoke_prepare_options = false;
  fixed_config.ignore_unknown_options = false;
  fixed_config.input_strings_escaped = false;

  return GetBlockBasedTableOptionsFromString(fixed_config, table_options,
                                             opts_str, new_table_options);
}
|
|
|
|
// Splits `opts_str` into a name/value map with StringToMap() and applies it
// through GetBlockBasedTableOptionsFromMap().
//
// A StringToMap() failure is returned as-is. From the map step, OK and
// InvalidArgument results are returned unchanged; any other error is
// re-wrapped as InvalidArgument carrying the original status' state text.
Status GetBlockBasedTableOptionsFromString(
    const ConfigOptions& config_options,
    const BlockBasedTableOptions& table_options, const std::string& opts_str,
    BlockBasedTableOptions* new_table_options) {
  std::unordered_map<std::string, std::string> parsed_opts;
  Status result = StringToMap(opts_str, &parsed_opts);
  if (!result.ok()) {
    return result;
  }

  result = GetBlockBasedTableOptionsFromMap(config_options, table_options,
                                            parsed_opts, new_table_options);
  if (result.ok() || result.IsInvalidArgument()) {
    return result;
  }
  // Translate any other error (NotFound, NotSupported, ...) to
  // InvalidArgument.
  return Status::InvalidArgument(result.getState());
}
|
|
|
|
|
|
|
|
// Convenience overload taking the two flags directly: copies
// `input_strings_escaped` and `ignore_unknown_options` into a fresh
// ConfigOptions, sets invoke_prepare_options to false, and forwards to the
// ConfigOptions-taking overload.
Status GetBlockBasedTableOptionsFromMap(
    const BlockBasedTableOptions& table_options,
    const std::unordered_map<std::string, std::string>& opts_map,
    BlockBasedTableOptions* new_table_options, bool input_strings_escaped,
    bool ignore_unknown_options) {
  ConfigOptions derived_config;
  derived_config.invoke_prepare_options = false;
  derived_config.ignore_unknown_options = ignore_unknown_options;
  derived_config.input_strings_escaped = input_strings_escaped;

  return GetBlockBasedTableOptionsFromMap(derived_config, table_options,
                                          opts_map, new_table_options);
}
|
|
|
|
|
|
|
|
// Applies `opts_map` on top of `table_options` by configuring a temporary
// BlockBasedTableFactory seeded with `table_options`.
//
// On success `*new_table_options` receives the factory's resulting options;
// on failure it receives a copy of the unmodified `table_options`. The status
// from ConfigureFromMap() is returned in both cases. `new_table_options` must
// not be null (asserted).
Status GetBlockBasedTableOptionsFromMap(
    const ConfigOptions& config_options,
    const BlockBasedTableOptions& table_options,
    const std::unordered_map<std::string, std::string>& opts_map,
    BlockBasedTableOptions* new_table_options) {
  assert(new_table_options);
  BlockBasedTableFactory scratch_factory(table_options);
  Status result = scratch_factory.ConfigureFromMap(config_options, opts_map);
  if (!result.ok()) {
    // Configuration failed: hand back the caller's starting options.
    *new_table_options = table_options;
    return result;
  }
  *new_table_options = *(scratch_factory.GetOptions<BlockBasedTableOptions>());
  return result;
}
|
|
|
|
#endif // !ROCKSDB_LITE
|
|
|
|
|
|
|
|
// Allocates a BlockBasedTableFactory constructed from `_table_options` with
// `new` and returns it as a raw TableFactory pointer.
TableFactory* NewBlockBasedTableFactory(
    const BlockBasedTableOptions& _table_options) {
  TableFactory* const factory = new BlockBasedTableFactory(_table_options);
  return factory;
}
|
|
|
|
|
|
|
|
// Definitions of the BlockBasedTablePropertyNames string constants.
const std::string BlockBasedTablePropertyNames::kIndexType(
    "rocksdb.block.based.table.index.type");
const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering(
    "rocksdb.block.based.table.whole.key.filtering");
const std::string BlockBasedTablePropertyNames::kPrefixFiltering(
    "rocksdb.block.based.table.prefix.filtering");

// Name strings for the hash index prefixes block and its metadata block.
const std::string kHashIndexPrefixesBlock("rocksdb.hashindex.prefixes");
const std::string kHashIndexPrefixesMetadataBlock(
    "rocksdb.hashindex.metadata");

// String forms "1" / "0" — presumably the encodings of true/false property
// values; confirm against the code that reads them.
const std::string kPropTrue("1");
const std::string kPropFalse("0");
|
|
|
|
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|