// Provenance: fork of https://github.com/oxigraph/rocksdb and
// https://github.com/facebook/rocksdb for nextgraph and oxigraph.
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
//
|
|
// Currently we support two types of tables: plain table and block-based table.
|
|
// 1. Block-based table: this is the default table type that we inherited from
|
|
// LevelDB, which was designed for storing data in hard disk or flash
|
|
// device.
|
|
// 2. Plain table: it is one of RocksDB's SST file format optimized
|
|
// for low query latency on pure-memory or really low-latency media.
|
|
//
|
|
// A tutorial of rocksdb table formats is available here:
|
|
// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats
|
|
//
|
|
// Example code is also available
|
|
// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples
|
|
|
|
#pragma once
|
|
|
|
#include <cstddef>
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>

#include "rocksdb/cache.h"
#include "rocksdb/customizable.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/status.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
// -- Block-based Table
//
// Forward declarations of types that are only referenced by pointer,
// reference or smart pointer in this header.
class Cache;
class FilterPolicy;
class FlushBlockPolicyFactory;
class PersistentCache;
class RandomAccessFile;
struct TableReaderOptions;
struct TableBuilderOptions;
class TableBuilder;
class TableFactory;
class TableReader;
class WritableFileWriter;
struct ConfigOptions;
struct EnvOptions;
|
|
|
|
// Types of checksums to use for checking integrity of logical blocks within
// files. All checksums currently use 32 bits of checking power (1 in 4B
// chance of failing to detect random corruption).
//
// The underlying type is `char` and every enumerator carries an explicit
// numeric value.
enum ChecksumType : char {
  kNoChecksum = 0x0,
  kCRC32c = 0x1,
  kxxHash = 0x2,
  kxxHash64 = 0x3,
  kXXH3 = 0x4,  // Supported since RocksDB 6.27
};
|
|
|
|
// `PinningTier` is used to specify which tier of block-based tables should
// be affected by a block cache pinning setting (see
// `MetadataCacheOptions` below).
enum class PinningTier {
  // For compatibility, this value specifies to fallback to the behavior
  // indicated by the deprecated options,
  // `pin_l0_filter_and_index_blocks_in_cache` and
  // `pin_top_level_index_and_filter`.
  kFallback,

  // This tier contains no block-based tables.
  kNone,

  // This tier contains block-based tables that may have originated from a
  // memtable flush. In particular, it includes tables from L0 that are smaller
  // than 1.5 times the current `write_buffer_size`. Note these criteria imply
  // it can include intra-L0 compaction outputs and ingested files, as long as
  // they are not abnormally large compared to flushed files in L0.
  kFlushedAndSimilar,

  // This tier contains all block-based tables.
  kAll,
};
|
|
|
|
// `MetadataCacheOptions` contains members indicating the desired caching
|
|
// behavior for the different categories of metadata blocks.
|
|
struct MetadataCacheOptions {
|
|
// The tier of block-based tables whose top-level index into metadata
|
|
// partitions will be pinned. Currently indexes and filters may be
|
|
// partitioned.
|
|
//
|
|
// Note `cache_index_and_filter_blocks` must be true for this option to have
|
|
// any effect. Otherwise any top-level index into metadata partitions would be
|
|
// held in table reader memory, outside the block cache.
|
|
PinningTier top_level_index_pinning = PinningTier::kFallback;
|
|
|
|
// The tier of block-based tables whose metadata partitions will be pinned.
|
|
// Currently indexes and filters may be partitioned.
|
|
PinningTier partition_pinning = PinningTier::kFallback;
|
|
|
|
// The tier of block-based tables whose unpartitioned metadata blocks will be
|
|
// pinned.
|
|
//
|
|
// Note `cache_index_and_filter_blocks` must be true for this option to have
|
|
// any effect. Otherwise the unpartitioned meta-blocks would be held in table
|
|
// reader memory, outside the block cache.
|
|
PinningTier unpartitioned_pinning = PinningTier::kFallback;
|
|
};
|
|
|
|
// Options that can be specified for a single cache entry role. Currently the
// only feature is `charged`.
struct CacheEntryRoleOptions {
  // Tri-state choice for a feature: explicitly on, explicitly off, or defer
  // to a fallback.
  enum class Decision {
    kEnabled,
    kDisabled,
    kFallback,
  };

  // Decision for the `charged` feature. Defaults to `kFallback`.
  Decision charged = Decision::kFallback;

  // Two option sets are equal when their `charged` decisions are equal.
  bool operator==(const CacheEntryRoleOptions& other) const {
    return charged == other.charged;
  }
};
|
|
|
|
struct CacheUsageOptions {
|
|
CacheEntryRoleOptions options;
|
|
std::map<CacheEntryRole, CacheEntryRoleOptions> options_overrides;
|
|
};
|
|
|
|
// For advanced user only
|
|
struct BlockBasedTableOptions {
|
|
static const char* kName() { return "BlockTableOptions"; };
|
|
// @flush_block_policy_factory creates the instances of flush block policy.
|
|
// which provides a configurable way to determine when to flush a block in
|
|
// the block based tables. If not set, table builder will use the default
|
|
// block flush policy, which cut blocks by block size (please refer to
|
|
// `FlushBlockBySizePolicy`).
|
|
std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;
|
|
|
|
// TODO(kailiu) Temporarily disable this feature by making the default value
|
|
// to be false.
|
|
//
|
|
// TODO(ajkr) we need to update names of variables controlling meta-block
|
|
// caching as they should now apply to range tombstone and compression
|
|
// dictionary meta-blocks, in addition to index and filter meta-blocks.
|
|
//
|
|
// Whether to put index/filter blocks in the block cache. When false,
|
|
// each "table reader" object will pre-load index/filter blocks during
|
|
// table initialization. Index and filter partition blocks always use
|
|
// block cache regardless of this option.
|
|
bool cache_index_and_filter_blocks = false;
|
|
|
|
// If cache_index_and_filter_blocks is enabled, cache index and filter
|
|
// blocks with high priority. If set to true, depending on implementation of
|
|
// block cache, index and filter blocks may be less likely to be evicted
|
|
// than data blocks.
|
|
bool cache_index_and_filter_blocks_with_high_priority = true;
|
|
|
|
// DEPRECATED: This option will be removed in a future version. For now, this
|
|
// option still takes effect by updating each of the following variables that
|
|
// has the default value, `PinningTier::kFallback`:
|
|
//
|
|
// - `MetadataCacheOptions::partition_pinning`
|
|
// - `MetadataCacheOptions::unpartitioned_pinning`
|
|
//
|
|
// The updated value is chosen as follows:
|
|
//
|
|
// - `pin_l0_filter_and_index_blocks_in_cache == false` ->
|
|
// `PinningTier::kNone`
|
|
// - `pin_l0_filter_and_index_blocks_in_cache == true` ->
|
|
// `PinningTier::kFlushedAndSimilar`
|
|
//
|
|
// To migrate away from this flag, explicitly configure
|
|
// `MetadataCacheOptions` as described above.
|
|
//
|
|
// if cache_index_and_filter_blocks is true and the below is true, then
|
|
// filter and index blocks are stored in the cache, but a reference is
|
|
// held in the "table reader" object so the blocks are pinned and only
|
|
// evicted from cache when the table reader is freed.
|
|
bool pin_l0_filter_and_index_blocks_in_cache = false;
|
|
|
|
// DEPRECATED: This option will be removed in a future version. For now, this
|
|
// option still takes effect by updating
|
|
// `MetadataCacheOptions::top_level_index_pinning` when it has the
|
|
// default value, `PinningTier::kFallback`.
|
|
//
|
|
// The updated value is chosen as follows:
|
|
//
|
|
// - `pin_top_level_index_and_filter == false` ->
|
|
// `PinningTier::kNone`
|
|
// - `pin_top_level_index_and_filter == true` ->
|
|
// `PinningTier::kAll`
|
|
//
|
|
// To migrate away from this flag, explicitly configure
|
|
// `MetadataCacheOptions` as described above.
|
|
//
|
|
// If cache_index_and_filter_blocks is true and the below is true, then
|
|
// the top-level index of partitioned filter and index blocks are stored in
|
|
// the cache, but a reference is held in the "table reader" object so the
|
|
// blocks are pinned and only evicted from cache when the table reader is
|
|
// freed. This is not limited to l0 in LSM tree.
|
|
bool pin_top_level_index_and_filter = true;
|
|
|
|
// The desired block cache pinning behavior for the different categories of
|
|
// metadata blocks. While pinning can reduce block cache contention, users
|
|
// must take care not to pin excessive amounts of data, which risks
|
|
// overflowing block cache.
|
|
MetadataCacheOptions metadata_cache_options;
|
|
|
|
// The index type that will be used for this table.
|
|
enum IndexType : char {
|
|
// A space efficient index block that is optimized for
|
|
// binary-search-based index.
|
|
kBinarySearch = 0x00,
|
|
|
|
// The hash index, if enabled, will do the hash lookup when
|
|
// `Options.prefix_extractor` is provided.
|
|
kHashSearch = 0x01,
|
|
|
|
// A two-level index implementation. Both levels are binary search indexes.
|
|
// Second level index blocks ("partitions") use block cache even when
|
|
// cache_index_and_filter_blocks=false.
|
|
kTwoLevelIndexSearch = 0x02,
|
|
|
|
// Like kBinarySearch, but index also contains first key of each block.
|
|
// This allows iterators to defer reading the block until it's actually
|
|
// needed. May significantly reduce read amplification of short range scans.
|
|
// Without it, iterator seek usually reads one block from each level-0 file
|
|
// and from each level, which may be expensive.
|
|
// Works best in combination with:
|
|
// - IndexShorteningMode::kNoShortening,
|
|
// - custom FlushBlockPolicy to cut blocks at some meaningful boundaries,
|
|
// e.g. when prefix changes.
|
|
// Makes the index significantly bigger (2x or more), especially when keys
|
|
// are long.
|
|
kBinarySearchWithFirstKey = 0x03,
|
|
};
|
|
|
|
IndexType index_type = kBinarySearch;
|
|
|
|
// The index type that will be used for the data block.
|
|
enum DataBlockIndexType : char {
|
|
kDataBlockBinarySearch = 0, // traditional block type
|
|
kDataBlockBinaryAndHash = 1, // additional hash index
|
|
};
|
|
|
|
DataBlockIndexType data_block_index_type = kDataBlockBinarySearch;
|
|
|
|
// #entries/#buckets. It is valid only when data_block_hash_index_type is
|
|
// kDataBlockBinaryAndHash.
|
|
double data_block_hash_table_util_ratio = 0.75;
|
|
|
|
// Option hash_index_allow_collision is now deleted.
|
|
// It will behave as if hash_index_allow_collision=true.
|
|
|
|
// Use the specified checksum type. Newly created table files will be
|
|
// protected with this checksum type. Old table files will still be readable,
|
|
// even though they have different checksum type.
|
|
ChecksumType checksum = kCRC32c;
|
|
|
|
// Disable block cache. If this is set to true,
|
|
// then no block cache should be used, and the block_cache should
|
|
// point to a nullptr object.
|
|
bool no_block_cache = false;
|
|
|
|
// If non-NULL use the specified cache for blocks.
|
|
// If NULL, rocksdb will automatically create and use an 8MB internal cache.
|
|
std::shared_ptr<Cache> block_cache = nullptr;
|
|
|
|
// If non-NULL use the specified cache for pages read from device
|
|
// IF NULL, no page cache is used
|
|
std::shared_ptr<PersistentCache> persistent_cache = nullptr;
|
|
|
|
// If non-NULL use the specified cache for compressed blocks.
|
|
// If NULL, rocksdb will not use a compressed block cache.
|
|
// Note: though it looks similar to `block_cache`, RocksDB doesn't put the
|
|
// same type of object there.
|
|
std::shared_ptr<Cache> block_cache_compressed = nullptr;
|
|
|
|
// Approximate size of user data packed per block. Note that the
|
|
// block size specified here corresponds to uncompressed data. The
|
|
// actual size of the unit read from disk may be smaller if
|
|
// compression is enabled. This parameter can be changed dynamically.
|
|
uint64_t block_size = 4 * 1024;
|
|
|
|
// This is used to close a block before it reaches the configured
|
|
// 'block_size'. If the percentage of free space in the current block is less
|
|
// than this specified number and adding a new record to the block will
|
|
// exceed the configured block size, then this block will be closed and the
|
|
// new record will be written to the next block.
|
|
int block_size_deviation = 10;
|
|
|
|
// Number of keys between restart points for delta encoding of keys.
|
|
// This parameter can be changed dynamically. Most clients should
|
|
// leave this parameter alone. The minimum value allowed is 1. Any smaller
|
|
// value will be silently overwritten with 1.
|
|
int block_restart_interval = 16;
|
|
|
|
// Same as block_restart_interval but used for the index block.
|
|
int index_block_restart_interval = 1;
|
|
|
|
// Block size for partitioned metadata. Currently applied to indexes when
|
|
// kTwoLevelIndexSearch is used and to filters when partition_filters is used.
|
|
// Note: Since in the current implementation the filters and index partitions
|
|
// are aligned, an index/filter block is created when either index or filter
|
|
// block size reaches the specified limit.
|
|
// Note: this limit is currently applied to only index blocks; a filter
|
|
// partition is cut right after an index block is cut
|
|
// TODO(myabandeh): remove the note above when filter partitions are cut
|
|
// separately
|
|
uint64_t metadata_block_size = 4096;
|
|
|
|
// `cache_usage_options` allows users to specify the default
|
|
// options (`cache_usage_options.options`) and the overriding
|
|
// options (`cache_usage_options.options_overrides`)
|
|
// for different `CacheEntryRole` under various features related to cache
|
|
// usage.
|
|
//
|
|
// For a certain `CacheEntryRole role` and a certain feature `f` of
|
|
// `CacheEntryRoleOptions`:
|
|
// 1. If `options_overrides` has an entry for `role` and
|
|
// `options_overrides[role].f != kFallback`, we use
|
|
// `options_overrides[role].f`
|
|
// 2. Otherwise, if `options[role].f != kFallback`, we use `options[role].f`
|
|
// 3. Otherwise, we follow the compatible existing behavior for `f` (see
|
|
// each feature's comment for more)
|
|
//
|
|
// `cache_usage_options` currently supports specifying options for the
|
|
// following features:
|
|
//
|
|
// 1. Memory charging to block cache (`CacheEntryRoleOptions::charged`)
|
|
// Memory charging is a feature of accounting memory usage of specific area
|
|
// (represented by `CacheEntryRole`) toward usage in block cache (if
|
|
// available), by updating a dynamical charge to the block cache loosely based
|
|
// on the actual memory usage of that area.
|
|
//
|
|
// (a) CacheEntryRole::kCompressionDictionaryBuildingBuffer
|
|
// (i) If kEnabled:
|
|
// Charge memory usage of the buffered data used as training samples for
|
|
// dictionary compression.
|
|
// If such memory usage exceeds the avaible space left in the block cache
|
|
// at some point (i.e, causing a cache full under
|
|
// `LRUCacheOptions::strict_capacity_limit` = true), the data will then be
|
|
// unbuffered.
|
|
// (ii) If kDisabled:
|
|
// Does not charge the memory usage mentioned above.
|
|
// (iii) Compatible existing behavior:
|
|
// Same as kEnabled.
|
|
//
|
|
// (b) CacheEntryRole::kFilterConstruction
|
|
// (i) If kEnabled:
|
|
// Charge memory usage of Bloom Filter
|
|
// (format_version >= 5) and Ribbon Filter construction.
|
|
// If additional temporary memory of Ribbon Filter exceeds the avaible
|
|
// space left in the block cache at some point (i.e, causing a cache full
|
|
// under `LRUCacheOptions::strict_capacity_limit` = true),
|
|
// construction will fall back to Bloom Filter.
|
|
// (ii) If kDisabled:
|
|
// Does not charge the memory usage mentioned above.
|
|
// (iii) Compatible existing behavior:
|
|
// Same as kDisabled.
|
|
//
|
|
// (c) CacheEntryRole::kBlockBasedTableReader
|
|
// (i) If kEnabled:
|
|
// Charge memory usage of table properties +
|
|
// index block/filter block/uncompression dictionary (when stored in table
|
|
// reader i.e, BlockBasedTableOptions::cache_index_and_filter_blocks ==
|
|
// false) + some internal data structures during table reader creation.
|
|
// If such a table reader exceeds
|
|
// the avaible space left in the block cache at some point (i.e, causing
|
|
// a cache full under `LRUCacheOptions::strict_capacity_limit` = true),
|
|
// creation will fail with Status::MemoryLimit().
|
|
// (ii) If kDisabled:
|
|
// Does not charge the memory usage mentioned above.
|
|
// (iii) Compatible existing behavior:
|
|
// Same as kDisabled.
|
|
//
|
|
// (d) Other CacheEntryRole
|
|
// Not supported.
|
|
// `Status::kNotSupported` will be returned if
|
|
// `CacheEntryRoleOptions::charged` is set to {`kEnabled`, `kDisabled`}.
|
|
//
|
|
//
|
|
// 2. More to come ...
|
|
//
|
|
CacheUsageOptions cache_usage_options;
|
|
|
|
// Note: currently this option requires kTwoLevelIndexSearch to be set as
|
|
// well.
|
|
// TODO(myabandeh): remove the note above once the limitation is lifted
|
|
// Use partitioned full filters for each SST file. This option is
|
|
// incompatible with block-based filters. Filter partition blocks use
|
|
// block cache even when cache_index_and_filter_blocks=false.
|
|
bool partition_filters = false;
|
|
|
|
// Option to generate Bloom/Ribbon filters that minimize memory
|
|
// internal fragmentation.
|
|
//
|
|
// When false, malloc_usable_size is not available, or format_version < 5,
|
|
// filters are generated without regard to internal fragmentation when
|
|
// loaded into memory (historical behavior). When true (and
|
|
// malloc_usable_size is available and format_version >= 5), then
|
|
// filters are generated to "round up" and "round down" their sizes to
|
|
// minimize internal fragmentation when loaded into memory, assuming the
|
|
// reading DB has the same memory allocation characteristics as the
|
|
// generating DB. This option does not break forward or backward
|
|
// compatibility.
|
|
//
|
|
// While individual filters will vary in bits/key and false positive rate
|
|
// when setting is true, the implementation attempts to maintain a weighted
|
|
// average FP rate for filters consistent with this option set to false.
|
|
//
|
|
// With Jemalloc for example, this setting is expected to save about 10% of
|
|
// the memory footprint and block cache charge of filters, while increasing
|
|
// disk usage of filters by about 1-2% due to encoding efficiency losses
|
|
// with variance in bits/key.
|
|
//
|
|
// NOTE: Because some memory counted by block cache might be unmapped pages
|
|
// within internal fragmentation, this option can increase observed RSS
|
|
// memory usage. With cache_index_and_filter_blocks=true, this option makes
|
|
// the block cache better at using space it is allowed. (These issues
|
|
// should not arise with partitioned filters.)
|
|
//
|
|
// NOTE: Do not set to true if you do not trust malloc_usable_size. With
|
|
// this option, RocksDB might access an allocated memory object beyond its
|
|
// original size if malloc_usable_size says it is safe to do so. While this
|
|
// can be considered bad practice, it should not produce undefined behavior
|
|
// unless malloc_usable_size is buggy or broken.
|
|
bool optimize_filters_for_memory = false;
|
|
|
|
// Use delta encoding to compress keys in blocks.
|
|
// ReadOptions::pin_data requires this option to be disabled.
|
|
//
|
|
// Default: true
|
|
bool use_delta_encoding = true;
|
|
|
|
// If non-nullptr, use the specified filter policy to reduce disk reads.
|
|
// Many applications will benefit from passing the result of
|
|
// NewBloomFilterPolicy() here.
|
|
std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
|
|
|
|
// If true, place whole keys in the filter (not just prefixes).
|
|
// This must generally be true for gets to be efficient.
|
|
bool whole_key_filtering = true;
|
|
|
|
// If true, detect corruption during Bloom Filter (format_version >= 5)
|
|
// and Ribbon Filter construction.
|
|
//
|
|
// This is an extra check that is only
|
|
// useful in detecting software bugs or CPU+memory malfunction.
|
|
// Turning on this feature increases filter construction time by 30%.
|
|
//
|
|
// This parameter can be changed dynamically by
|
|
// DB::SetOptions({{"block_based_table_factory",
|
|
// "{detect_filter_construct_corruption=true;}"}});
|
|
//
|
|
// TODO: optimize this performance
|
|
bool detect_filter_construct_corruption = false;
|
|
|
|
// Verify that decompressing the compressed block gives back the input. This
|
|
// is a verification mode that we use to detect bugs in compression
|
|
// algorithms.
|
|
bool verify_compression = false;
|
|
|
|
// If used, For every data block we load into memory, we will create a bitmap
|
|
// of size ((block_size / `read_amp_bytes_per_bit`) / 8) bytes. This bitmap
|
|
// will be used to figure out the percentage we actually read of the blocks.
|
|
//
|
|
// When this feature is used Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES and
|
|
// Tickers::READ_AMP_TOTAL_READ_BYTES can be used to calculate the
|
|
// read amplification using this formula
|
|
// (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES)
|
|
//
|
|
// value => memory usage (percentage of loaded blocks memory)
|
|
// 1 => 12.50 %
|
|
// 2 => 06.25 %
|
|
// 4 => 03.12 %
|
|
// 8 => 01.56 %
|
|
// 16 => 00.78 %
|
|
//
|
|
// Note: This number must be a power of 2, if not it will be sanitized
|
|
// to be the next lowest power of 2, for example a value of 7 will be
|
|
// treated as 4, a value of 19 will be treated as 16.
|
|
//
|
|
// Default: 0 (disabled)
|
|
uint32_t read_amp_bytes_per_bit = 0;
|
|
|
|
// We currently have these versions:
|
|
// 0 -- This version can be read by really old RocksDB's. Doesn't support
|
|
// changing checksum type (default is CRC32).
|
|
// 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default
|
|
// checksum, like xxHash. It is written by RocksDB when
|
|
// BlockBasedTableOptions::checksum is something other than kCRC32c. (version
|
|
// 0 is silently upconverted)
|
|
// 2 -- Can be read by RocksDB's versions since 3.10. Changes the way we
|
|
// encode compressed blocks with LZ4, BZip2 and Zlib compression. If you
|
|
// don't plan to run RocksDB before version 3.10, you should probably use
|
|
// this.
|
|
// 3 -- Can be read by RocksDB's versions since 5.15. Changes the way we
|
|
// encode the keys in index blocks. If you don't plan to run RocksDB before
|
|
// version 5.15, you should probably use this.
|
|
// This option only affects newly written tables. When reading existing
|
|
// tables, the information about version is read from the footer.
|
|
// 4 -- Can be read by RocksDB's versions since 5.16. Changes the way we
|
|
// encode the values in index blocks. If you don't plan to run RocksDB before
|
|
// version 5.16 and you are using index_block_restart_interval > 1, you should
|
|
// probably use this as it would reduce the index size.
|
|
// This option only affects newly written tables. When reading existing
|
|
// tables, the information about version is read from the footer.
|
|
// 5 -- Can be read by RocksDB's versions since 6.6.0. Full and partitioned
|
|
// filters use a generally faster and more accurate Bloom filter
|
|
// implementation, with a different schema.
|
|
uint32_t format_version = 5;
|
|
|
|
// Store index blocks on disk in compressed format. Changing this option to
|
|
// false will avoid the overhead of decompression if index blocks are evicted
|
|
// and read back
|
|
bool enable_index_compression = true;
|
|
|
|
// Align data blocks on lesser of page size and block size
|
|
bool block_align = false;
|
|
|
|
// This enum allows trading off increased index size for improved iterator
|
|
// seek performance in some situations, particularly when block cache is
|
|
// disabled (ReadOptions::fill_cache = false) and direct IO is
|
|
// enabled (DBOptions::use_direct_reads = true).
|
|
// The default mode is the best tradeoff for most use cases.
|
|
// This option only affects newly written tables.
|
|
//
|
|
// The index contains a key separating each pair of consecutive blocks.
|
|
// Let A be the highest key in one block, B the lowest key in the next block,
|
|
// and I the index entry separating these two blocks:
|
|
// [ ... A] I [B ...]
|
|
// I is allowed to be anywhere in [A, B).
|
|
// If an iterator is seeked to a key in (A, I], we'll unnecessarily read the
|
|
// first block, then immediately fall through to the second block.
|
|
// However, if I=A, this can't happen, and we'll read only the second block.
|
|
// In kNoShortening mode, we use I=A. In other modes, we use the shortest
|
|
// key in [A, B), which usually significantly reduces index size.
|
|
//
|
|
// There's a similar story for the last index entry, which is an upper bound
|
|
// of the highest key in the file. If it's shortened and therefore
|
|
// overestimated, iterator is likely to unnecessarily read the last data block
|
|
// from each file on each seek.
|
|
enum class IndexShorteningMode : char {
|
|
// Use full keys.
|
|
kNoShortening,
|
|
// Shorten index keys between blocks, but use full key for the last index
|
|
// key, which is the upper bound of the whole file.
|
|
kShortenSeparators,
|
|
// Shorten both keys between blocks and key after last block.
|
|
kShortenSeparatorsAndSuccessor,
|
|
};
|
|
|
|
IndexShorteningMode index_shortening =
|
|
IndexShorteningMode::kShortenSeparators;
|
|
|
|
// RocksDB does auto-readahead for iterators on noticing more than two reads
|
|
// for a table file if user doesn't provide readahead_size. The readahead
|
|
// starts at BlockBasedTableOptions.initial_auto_readahead_size (default: 8KB)
|
|
// and doubles on every additional read upto max_auto_readahead_size and
|
|
// max_auto_readahead_size can be configured.
|
|
//
|
|
// Special Value: 0 - If max_auto_readahead_size is set 0 then it will disable
|
|
// the implicit auto prefetching.
|
|
// If max_auto_readahead_size provided is less
|
|
// than initial_auto_readahead_size, then RocksDB will sanitize the
|
|
// initial_auto_readahead_size and set it to max_auto_readahead_size.
|
|
//
|
|
// Value should be provided along with KB i.e. 256 * 1024 as it will prefetch
|
|
// the blocks.
|
|
//
|
|
// Found that 256 KB readahead size provides the best performance, based on
|
|
// experiments, for auto readahead. Experiment data is in PR #3282.
|
|
//
|
|
// This parameter can be changed dynamically by
|
|
// DB::SetOptions({{"block_based_table_factory",
|
|
// "{max_auto_readahead_size=0;}"}}));
|
|
//
|
|
// Changing the value dynamically will only affect files opened after the
|
|
// change.
|
|
//
|
|
// Default: 256 KB (256 * 1024).
|
|
size_t max_auto_readahead_size = 256 * 1024;
|
|
|
|
// If enabled, prepopulate warm/hot blocks (data, uncompressed dict, index and
|
|
// filter blocks) which are already in memory into block cache at the time of
|
|
// flush. On a flush, the block that is in memory (in memtables) get flushed
|
|
// to the device. If using Direct IO, additional IO is incurred to read this
|
|
// data back into memory again, which is avoided by enabling this option. This
|
|
// further helps if the workload exhibits high temporal locality, where most
|
|
// of the reads go to recently written data. This also helps in case of
|
|
// Distributed FileSystem.
|
|
//
|
|
// This parameter can be changed dynamically by
|
|
// DB::SetOptions({{"block_based_table_factory",
|
|
// "{prepopulate_block_cache=kFlushOnly;}"}}));
|
|
enum class PrepopulateBlockCache : char {
|
|
// Disable prepopulate block cache.
|
|
kDisable,
|
|
// Prepopulate blocks during flush only.
|
|
kFlushOnly,
|
|
};
|
|
|
|
PrepopulateBlockCache prepopulate_block_cache =
|
|
PrepopulateBlockCache::kDisable;
|
|
|
|
// RocksDB does auto-readahead for iterators on noticing more than two reads
|
|
// for a table file if user doesn't provide readahead_size. The readahead size
|
|
// starts at initial_auto_readahead_size and doubles on every additional read
|
|
// upto BlockBasedTableOptions.max_auto_readahead_size.
|
|
// max_auto_readahead_size can also be configured.
|
|
//
|
|
// Scenarios:
|
|
// - If initial_auto_readahead_size is set 0 then it will disabled the
|
|
// implicit auto prefetching irrespective of max_auto_readahead_size.
|
|
// - If max_auto_readahead_size is set 0, it will disable the internal
|
|
// prefetching irrespective of initial_auto_readahead_size.
|
|
// - If initial_auto_readahead_size > max_auto_readahead_size, then RocksDB
|
|
// will sanitize the value of initial_auto_readahead_size to
|
|
// max_auto_readahead_size and readahead_size will be
|
|
// max_auto_readahead_size.
|
|
//
|
|
// Value should be provided along with KB i.e. 8 * 1024 as it will prefetch
|
|
// the blocks.
|
|
//
|
|
// This parameter can be changed dynamically by
|
|
// DB::SetOptions({{"block_based_table_factory",
|
|
// "{initial_auto_readahead_size=0;}"}}));
|
|
//
|
|
// Changing the value dynamically will only affect files opened after the
|
|
// change.
|
|
//
|
|
// Default: 8 KB (8 * 1024).
|
|
size_t initial_auto_readahead_size = 8 * 1024;
|
|
};
|
|
|
|
// Names of table properties that are specific to block-based tables.
// The strings are only declared here; their definitions are not in this
// header.
struct BlockBasedTablePropertyNames {
  // The value of this property is a fixed int32 number.
  static const std::string kIndexType;
  // The value is "1" for true and "0" for false.
  static const std::string kWholeKeyFiltering;
  // The value is "1" for true and "0" for false.
  static const std::string kPrefixFiltering;
};
|
|
|
|
// Create default block based table factory.
|
|
extern TableFactory* NewBlockBasedTableFactory(
|
|
const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
// How keys are encoded (see `PlainTableOptions::encoding_type`).
// The underlying type is `char`.
enum EncodingType : char {
  // Always write full keys without any special encoding.
  kPlain,
  // Find opportunity to write the same prefix once for multiple rows.
  // In some cases, when a key follows a previous key with the same prefix,
  // instead of writing out the full key, it just writes out the size of the
  // shared prefix, as well as other bytes, to save some bytes.
  //
  // When using this option, the user is required to use the same prefix
  // extractor to make sure the same prefix will be extracted from the same key.
  // The Name() value of the prefix extractor will be stored in the file. When
  // reopening the file, the name of the options.prefix_extractor given will be
  // bitwise compared to the prefix extractors stored in the file. An error
  // will be returned if the two don't match.
  kPrefix,
};
|
|
|
|
// Names of table properties that are specific to plain tables.
// The strings are only declared here; their definitions are not in this
// header.
struct PlainTablePropertyNames {
  static const std::string kEncodingType;
  static const std::string kBloomVersion;
  static const std::string kNumBloomBlocks;
};
|
|
|
|
// Value to pass as `PlainTableOptions::user_key_len` when keys have variable
// lengths.
constexpr uint32_t kPlainTableVariableLength = 0;
|
|
|
|
struct PlainTableOptions {
|
|
static const char* kName() { return "PlainTableOptions"; };
|
|
// @user_key_len: plain table has optimization for fix-sized keys, which can
|
|
// be specified via user_key_len. Alternatively, you can pass
|
|
// `kPlainTableVariableLength` if your keys have variable
|
|
// lengths.
|
|
uint32_t user_key_len = kPlainTableVariableLength;
|
|
|
|
// @bloom_bits_per_key: the number of bits used for bloom filer per prefix.
|
|
// You may disable it by passing a zero.
|
|
int bloom_bits_per_key = 10;
|
|
|
|
// @hash_table_ratio: the desired utilization of the hash table used for
|
|
// prefix hashing.
|
|
// hash_table_ratio = number of prefixes / #buckets in the
|
|
// hash table
|
|
double hash_table_ratio = 0.75;
|
|
|
|
// @index_sparseness: inside each prefix, need to build one index record for
|
|
// how many keys for binary search inside each hash bucket.
|
|
// For encoding type kPrefix, the value will be used when
|
|
// writing to determine an interval to rewrite the full
|
|
// key. It will also be used as a suggestion and satisfied
|
|
// when possible.
|
|
size_t index_sparseness = 16;
|
|
|
|
// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
|
|
// Otherwise from huge page TLB. The user needs to
|
|
// reserve huge pages for it to be allocated, like:
|
|
// sysctl -w vm.nr_hugepages=20
|
|
// See linux doc Documentation/vm/hugetlbpage.txt
|
|
size_t huge_page_tlb_size = 0;
|
|
|
|
// @encoding_type: how to encode the keys. See enum EncodingType above for
|
|
// the choices. The value will determine how to encode keys
|
|
// when writing to a new SST file. This value will be stored
|
|
// inside the SST file which will be used when reading from
|
|
// the file, which makes it possible for users to choose
|
|
// different encoding type when reopening a DB. Files with
|
|
// different encoding types can co-exist in the same DB and
|
|
// can be read.
|
|
EncodingType encoding_type = kPlain;
|
|
|
|
// @full_scan_mode: mode for reading the whole file one record by one without
|
|
// using the index.
|
|
bool full_scan_mode = false;
|
|
|
|
// @store_index_in_file: compute plain table index and bloom filter during
|
|
// file building and store it in file. When reading
|
|
// file, index will be mapped instead of recomputation.
|
|
bool store_index_in_file = false;
|
|
};
|
|
|
|
// -- Plain Table with prefix-only seek
|
|
// For this factory, you need to set Options.prefix_extractor properly to make
|
|
// it work. Look-up will starts with prefix hash lookup for key prefix. Inside
|
|
// the hash bucket found, a binary search is executed for hash conflicts.
|
|
// Finally, a linear search is used.
|
|
|
|
extern TableFactory* NewPlainTableFactory(
|
|
const PlainTableOptions& options = PlainTableOptions());
|
|
|
|
// Names of the table properties written for cuckoo tables. The strings are
// declared here and defined elsewhere.
struct CuckooTablePropertyNames {
  // Key used to fill empty buckets.
  static const std::string kEmptyKey;

  // Fixed length of a value.
  static const std::string kValueLength;

  // How many hash functions the Cuckoo Hash uses.
  static const std::string kNumHashFunc;

  // Number of buckets in a Cuckoo Block. For a given key and a particular
  // hash function, a Cuckoo Block is a run of consecutive buckets whose
  // starting bucket id is given by the hash function applied to the key. If
  // inserting the key collides, the builder tries the other locations of the
  // cuckoo block before moving on to the next hash function, which reduces
  // cache misses on reads that hit a collision.
  static const std::string kCuckooBlockSize;

  // Size of the hash table; this number is the modulus applied to the hash
  // function. The actual number of buckets is kMaxHashTableSize +
  // kCuckooBlockSize - 1: the last kCuckooBlockSize-1 buckets accommodate the
  // Cuckoo Block that starts at the end of the hash table, due to the cache
  // friendly implementation.
  static const std::string kHashTableSize;

  // Whether the keys sorted in the file are Internal Keys (false) or User
  // Keys only (true).
  static const std::string kIsLastLevel;

  // Whether the identity function is used as the first hash function.
  static const std::string kIdentityAsFirstHash;

  // Whether the hash value is calculated with a modulo or with a bitwise AND.
  static const std::string kUseModuleHash;

  // Fixed length of a user key.
  static const std::string kUserKeyLength;
};
|
|
|
|
// Options that control how cuckoo table files are built.
struct CuckooTableOptions {
  // Returns the string "CuckooTableOptions".
  static const char* kName() { return "CuckooTableOptions"; }

  // Determines the utilization of hash tables. Smaller values
  // result in larger hash tables with fewer collisions.
  double hash_table_ratio = 0.9;

  // A property used by the builder to determine how deep to search
  // for a path along which to displace elements in case of a
  // collision. See Builder.MakeSpaceForKey method. Higher
  // values result in more efficient hash tables with fewer
  // lookups but take more time to build.
  uint32_t max_search_depth = 100;

  // In case of a collision while inserting, the builder
  // attempts to insert in the next cuckoo_block_size
  // locations before skipping over to the next Cuckoo hash
  // function. This makes lookups more cache friendly in case
  // of collisions.
  uint32_t cuckoo_block_size = 5;

  // If this option is enabled, the user key is treated as uint64_t and its
  // value is used as the hash value directly. This option changes the
  // builder's behavior. The reader ignores this option and behaves according
  // to what is specified in the table property.
  bool identity_as_first_hash = false;

  // If this option is set to true, a modulo is used during hash calculation.
  // This often yields better space efficiency at the cost of performance.
  // If this option is set to false, the number of entries in the table is
  // constrained to be a power of two, and a bitwise AND is used to calculate
  // the hash, which is faster in general.
  bool use_module_hash = true;
};
|
|
|
|
// Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo Hashing
|
|
extern TableFactory* NewCuckooTableFactory(
|
|
const CuckooTableOptions& table_options = CuckooTableOptions());
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
class RandomAccessFileReader;
|
|
|
|
// A base class for table factories.
|
|
class TableFactory : public Customizable {
|
|
public:
|
|
virtual ~TableFactory() override {}
|
|
|
|
static const char* kBlockCacheOpts() { return "BlockCache"; };
|
|
static const char* kBlockBasedTableName() { return "BlockBasedTable"; };
|
|
static const char* kPlainTableName() { return "PlainTable"; }
|
|
static const char* kCuckooTableName() { return "CuckooTable"; };
|
|
|
|
// Creates and configures a new TableFactory from the input options and id.
|
|
static Status CreateFromString(const ConfigOptions& config_options,
|
|
const std::string& id,
|
|
std::shared_ptr<TableFactory>* factory);
|
|
|
|
static const char* Type() { return "TableFactory"; }
|
|
|
|
// Returns a Table object table that can fetch data from file specified
|
|
// in parameter file. It's the caller's responsibility to make sure
|
|
// file is in the correct format.
|
|
//
|
|
// NewTableReader() is called in three places:
|
|
// (1) TableCache::FindTable() calls the function when table cache miss
|
|
// and cache the table object returned.
|
|
// (2) SstFileDumper (for SST Dump) opens the table and dump the table
|
|
// contents using the iterator of the table.
|
|
// (3) DBImpl::IngestExternalFile() calls this function to read the contents
|
|
// of the sst file it's attempting to add
|
|
//
|
|
// table_reader_options is a TableReaderOptions which contain all the
|
|
// needed parameters and configuration to open the table.
|
|
// file is a file handler to handle the file for the table.
|
|
// file_size is the physical file size of the file.
|
|
// table_reader is the output table reader.
|
|
virtual Status NewTableReader(
|
|
const TableReaderOptions& table_reader_options,
|
|
std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
|
|
std::unique_ptr<TableReader>* table_reader,
|
|
bool prefetch_index_and_filter_in_cache = true) const {
|
|
ReadOptions ro;
|
|
return NewTableReader(ro, table_reader_options, std::move(file), file_size,
|
|
table_reader, prefetch_index_and_filter_in_cache);
|
|
}
|
|
|
|
// Overload of the above function that allows the caller to pass in a
|
|
// ReadOptions
|
|
virtual Status NewTableReader(
|
|
const ReadOptions& ro, const TableReaderOptions& table_reader_options,
|
|
std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
|
|
std::unique_ptr<TableReader>* table_reader,
|
|
bool prefetch_index_and_filter_in_cache) const = 0;
|
|
|
|
// Return a table builder to write to a file for this table type.
|
|
//
|
|
// It is called in several places:
|
|
// (1) When flushing memtable to a level-0 output file, it creates a table
|
|
// builder (In DBImpl::WriteLevel0Table(), by calling BuildTable())
|
|
// (2) During compaction, it gets the builder for writing compaction output
|
|
// files in DBImpl::OpenCompactionOutputFile().
|
|
// (3) When recovering from transaction logs, it creates a table builder to
|
|
// write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery,
|
|
// by calling BuildTable())
|
|
// (4) When running Repairer, it creates a table builder to convert logs to
|
|
// SST files (In Repairer::ConvertLogToTable() by calling BuildTable())
|
|
//
|
|
// Multiple configured can be accessed from there, including and not limited
|
|
// to compression options. file is a handle of a writable file.
|
|
// It is the caller's responsibility to keep the file open and close the file
|
|
// after closing the table builder. compression_type is the compression type
|
|
// to use in this table.
|
|
virtual TableBuilder* NewTableBuilder(
|
|
const TableBuilderOptions& table_builder_options,
|
|
WritableFileWriter* file) const = 0;
|
|
|
|
// Return is delete range supported
|
|
virtual bool IsDeleteRangeSupported() const { return false; }
|
|
};
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
// Create a special table factory that can open either of the supported
|
|
// table formats, based on setting inside the SST files. It should be used to
|
|
// convert a DB from one table format to another.
|
|
// @table_factory_to_write: the table factory used when writing to new files.
|
|
// @block_based_table_factory: block based table factory to use. If NULL, use
|
|
// a default one.
|
|
// @plain_table_factory: plain table factory to use. If NULL, use a default one.
|
|
// @cuckoo_table_factory: cuckoo table factory to use. If NULL, use a default
|
|
// one.
|
|
extern TableFactory* NewAdaptiveTableFactory(
|
|
std::shared_ptr<TableFactory> table_factory_to_write = nullptr,
|
|
std::shared_ptr<TableFactory> block_based_table_factory = nullptr,
|
|
std::shared_ptr<TableFactory> plain_table_factory = nullptr,
|
|
std::shared_ptr<TableFactory> cuckoo_table_factory = nullptr);
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
|