|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
3 years ago
|
|
|
#include "rocksdb/table.h"
|
|
|
|
|
|
|
|
#include <gtest/gtest.h>
|
|
|
|
#include <stddef.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
|
|
|
|
#include <algorithm>
|
|
|
|
#include <iostream>
|
|
|
|
#include <map>
|
|
|
|
#include <memory>
|
|
|
|
#include <string>
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
3 years ago
|
|
|
#include <unordered_set>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include "cache/lru_cache.h"
|
|
|
|
#include "db/dbformat.h"
|
|
|
|
#include "db/memtable.h"
|
|
|
|
#include "db/write_batch_internal.h"
|
|
|
|
#include "memtable/stl_wrappers.h"
|
|
|
|
#include "monitoring/statistics.h"
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
3 years ago
|
|
|
#include "options/options_helper.h"
|
|
|
|
#include "port/port.h"
|
|
|
|
#include "port/stack_trace.h"
|
|
|
|
#include "rocksdb/cache.h"
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
3 years ago
|
|
|
#include "rocksdb/compression_type.h"
|
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/env.h"
|
|
|
|
#include "rocksdb/file_checksum.h"
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether it's retryable or not, scope (i.e. fault domain) of the error, etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy, etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
5 years ago
|
|
|
#include "rocksdb/file_system.h"
|
|
|
|
#include "rocksdb/filter_policy.h"
|
|
|
|
#include "rocksdb/iterator.h"
|
|
|
|
#include "rocksdb/memtablerep.h"
|
|
|
|
#include "rocksdb/perf_context.h"
|
|
|
|
#include "rocksdb/slice_transform.h"
|
|
|
|
#include "rocksdb/statistics.h"
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
3 years ago
|
|
|
#include "rocksdb/table_properties.h"
|
|
|
|
#include "rocksdb/trace_record.h"
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
3 years ago
|
|
|
#include "rocksdb/unique_id.h"
|
|
|
|
#include "rocksdb/write_buffer_manager.h"
|
|
|
|
#include "table/block_based/block.h"
|
|
|
|
#include "table/block_based/block_based_table_builder.h"
|
|
|
|
#include "table/block_based/block_based_table_factory.h"
|
|
|
|
#include "table/block_based/block_based_table_reader.h"
|
|
|
|
#include "table/block_based/block_builder.h"
|
|
|
|
#include "table/block_based/flush_block_policy.h"
|
|
|
|
#include "table/block_fetcher.h"
|
|
|
|
#include "table/format.h"
|
|
|
|
#include "table/get_context.h"
|
|
|
|
#include "table/internal_iterator.h"
|
|
|
|
#include "table/meta_blocks.h"
|
|
|
|
#include "table/plain/plain_table_factory.h"
|
|
|
|
#include "table/scoped_arena_iterator.h"
|
|
|
|
#include "table/sst_file_writer_collectors.h"
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
3 years ago
|
|
|
#include "table/unique_id_impl.h"
|
|
|
|
#include "test_util/sync_point.h"
|
|
|
|
#include "test_util/testharness.h"
|
|
|
|
#include "test_util/testutil.h"
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
3 years ago
|
|
|
#include "util/coding_lean.h"
|
|
|
|
#include "util/compression.h"
|
|
|
|
#include "util/file_checksum_helper.h"
|
|
|
|
#include "util/random.h"
|
|
|
|
#include "util/string_util.h"
|
|
|
|
#include "utilities/memory_allocators.h"
|
|
|
|
#include "utilities/merge_operators.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
|
|
|
|
// Constants declared here with external linkage; no initializer is visible in
// this file, so their values come from another translation unit.
// NOTE(review): judging by the names these are the footer magic numbers for
// the (legacy and current) block-based and plain table formats -- confirm
// against the defining source file.
extern const uint64_t kLegacyBlockBasedTableMagicNumber;
|
|
|
|
extern const uint64_t kLegacyPlainTableMagicNumber;
|
|
|
|
extern const uint64_t kBlockBasedTableMagicNumber;
|
|
|
|
extern const uint64_t kPlainTableMagicNumber;
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
// A 10000-character string consisting solely of the letter 'o'.
// NOTE(review): presumably used as a large filler value by tests further down
// in this file -- confirm at the use sites.
const std::string kDummyValue(10000, 'o');
|
|
|
|
|
|
|
|
// DummyPropertiesCollector used to test BlockBasedTableProperties
|
|
|
|
class DummyPropertiesCollector : public TablePropertiesCollector {
|
|
|
|
public:
|
|
|
|
const char* Name() const override { return "DummyPropertiesCollector"; }
|
|
|
|
|
|
|
|
Status Finish(UserCollectedProperties* /*properties*/) override {
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status Add(const Slice& /*user_key*/, const Slice& /*value*/) override {
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
UserCollectedProperties GetReadableProperties() const override {
|
|
|
|
return UserCollectedProperties{};
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
class DummyPropertiesCollectorFactory1
|
|
|
|
: public TablePropertiesCollectorFactory {
|
|
|
|
public:
|
|
|
|
TablePropertiesCollector* CreateTablePropertiesCollector(
|
|
|
|
TablePropertiesCollectorFactory::Context /*context*/) override {
|
|
|
|
return new DummyPropertiesCollector();
|
|
|
|
}
|
|
|
|
const char* Name() const override {
|
|
|
|
return "DummyPropertiesCollectorFactory1";
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
class DummyPropertiesCollectorFactory2
|
|
|
|
: public TablePropertiesCollectorFactory {
|
|
|
|
public:
|
|
|
|
TablePropertiesCollector* CreateTablePropertiesCollector(
|
|
|
|
TablePropertiesCollectorFactory::Context /*context*/) override {
|
|
|
|
return new DummyPropertiesCollector();
|
|
|
|
}
|
|
|
|
const char* Name() const override {
|
|
|
|
return "DummyPropertiesCollectorFactory2";
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Return reverse of "key".
|
|
|
|
// Used to test non-lexicographic comparators.
|
|
|
|
std::string Reverse(const Slice& key) {
|
|
|
|
auto rev = key.ToString();
|
|
|
|
std::reverse(rev.begin(), rev.end());
|
|
|
|
return rev;
|
|
|
|
}
|
|
|
|
|
|
|
|
class ReverseKeyComparator : public Comparator {
|
|
|
|
public:
|
|
|
|
const char* Name() const override {
|
|
|
|
return "rocksdb.ReverseBytewiseComparator";
|
|
|
|
}
|
|
|
|
|
|
|
|
int Compare(const Slice& a, const Slice& b) const override {
|
|
|
|
return BytewiseComparator()->Compare(Reverse(a), Reverse(b));
|
|
|
|
}
|
|
|
|
|
|
|
|
void FindShortestSeparator(std::string* start,
|
|
|
|
const Slice& limit) const override {
|
|
|
|
std::string s = Reverse(*start);
|
|
|
|
std::string l = Reverse(limit);
|
|
|
|
BytewiseComparator()->FindShortestSeparator(&s, l);
|
|
|
|
*start = Reverse(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
void FindShortSuccessor(std::string* key) const override {
|
|
|
|
std::string s = Reverse(*key);
|
|
|
|
BytewiseComparator()->FindShortSuccessor(&s);
|
|
|
|
*key = Reverse(s);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Single file-local ReverseKeyComparator instance. Increment() identifies the
// reverse comparator by comparing against this object's address.
ReverseKeyComparator reverse_key_comparator;
|
|
|
|
|
|
|
|
void Increment(const Comparator* cmp, std::string* key) {
|
|
|
|
if (cmp == BytewiseComparator()) {
|
|
|
|
key->push_back('\0');
|
|
|
|
} else {
|
|
|
|
assert(cmp == &reverse_key_comparator);
|
|
|
|
std::string rev = Reverse(*key);
|
|
|
|
rev.push_back('\0');
|
|
|
|
*key = Reverse(rev);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// File-local shorthand for
// TablePropertiesCollectorFactory::Context::kUnknownColumnFamily.
const auto kUnknownColumnFamily =
|
|
|
|
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily;
|
|
|
|
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
// Helper class for tests to unify the interface between
|
|
|
|
// BlockBuilder/TableBuilder and Block/Table.
|
|
|
|
class Constructor {
|
|
|
|
public:
|
|
|
|
explicit Constructor(const Comparator* cmp)
|
|
|
|
: data_(stl_wrappers::LessOfComparator(cmp)) {}
|
|
|
|
virtual ~Constructor() { }
|
|
|
|
|
|
|
|
void Add(const std::string& key, const Slice& value) {
|
|
|
|
data_[key] = value.ToString();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Finish constructing the data structure with all the keys that have
|
|
|
|
// been added so far. Returns the keys in sorted order in "*keys"
|
|
|
|
// and stores the key/value pairs in "*kvmap"
|
|
|
|
void Finish(const Options& options, const ImmutableOptions& ioptions,
|
|
|
|
const MutableCFOptions& moptions,
|
|
|
|
const BlockBasedTableOptions& table_options,
|
|
|
|
const InternalKeyComparator& internal_comparator,
|
|
|
|
std::vector<std::string>* keys, stl_wrappers::KVMap* kvmap) {
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
3 years ago
|
|
|
last_internal_comparator_ = &internal_comparator;
|
|
|
|
*kvmap = data_;
|
|
|
|
keys->clear();
|
|
|
|
for (const auto& kv : data_) {
|
|
|
|
keys->push_back(kv.first);
|
|
|
|
}
|
|
|
|
data_.clear();
|
|
|
|
Status s = FinishImpl(options, ioptions, moptions, table_options,
|
|
|
|
internal_comparator, *kvmap);
|
|
|
|
ASSERT_TRUE(s.ok()) << s.ToString();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Construct the data structure from the data in "data"
|
|
|
|
virtual Status FinishImpl(const Options& options,
|
|
|
|
const ImmutableOptions& ioptions,
|
|
|
|
const MutableCFOptions& moptions,
|
|
|
|
const BlockBasedTableOptions& table_options,
|
|
|
|
const InternalKeyComparator& internal_comparator,
|
|
|
|
const stl_wrappers::KVMap& data) = 0;
|
|
|
|
|
|
|
|
// Creates an iterator over the structure built by FinishImpl().
//
// `prefix_extractor` is optional and defaults to nullptr; whether it is
// consulted is up to the subclass. The iterator is returned as a raw
// pointer -- NOTE(review): callers presumably own it; confirm at call sites.
virtual InternalIterator* NewIterator(
    const SliceTransform* prefix_extractor = nullptr) const = 0;
|
|
|
|
|
|
|
|
// Returns a const reference to the key/value map currently held in `data_`.
virtual const stl_wrappers::KVMap& data() { return data_; }
|
|
|
|
|
|
|
|
// Base implementation reports false; virtual so a subclass can answer true.
// NOTE(review): presumably signals that iterators are arena-allocated --
// confirm against the overriding subclasses and their callers.
virtual bool IsArenaMode() const { return false; }
|
|
|
|
|
|
|
|
// Returns the DB backing this constructor, or nullptr when there is none
// (the base-class default).
virtual DB* db() const { return nullptr; } // Overridden in DBConstructor
|
|
|
|
|
|
|
|
// Base implementation reports false; virtual so a subclass can answer true.
// NOTE(review): the name suggests the harness should `delete` the iterator
// regardless of arena mode when this is true -- confirm against callers.
virtual bool AnywayDeleteIterator() const { return false; }
|
|
|
|
|
|
|
|
protected:
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
3 years ago
|
|
|
// Address of the `internal_comparator` argument recorded just before
// FinishImpl() is invoked. Not owned; it is only as valid as the object the
// caller passed in.
const InternalKeyComparator* last_internal_comparator_;
|
|
|
|
|
|
|
|
private:
|
|
|
|
stl_wrappers::KVMap data_;
|
|
|
|
};
|
|
|
|
|
|
|
|
// A helper class that converts internal format keys into user keys
|
|
|
|
class KeyConvertingIterator : public InternalIterator {
|
|
|
|
public:
|
|
|
|
explicit KeyConvertingIterator(InternalIterator* iter,
|
|
|
|
bool arena_mode = false)
|
|
|
|
: iter_(iter), arena_mode_(arena_mode) {}
|
|
|
|
~KeyConvertingIterator() override {
|
|
|
|
if (arena_mode_) {
|
|
|
|
iter_->~InternalIterator();
|
|
|
|
} else {
|
|
|
|
delete iter_;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
bool Valid() const override { return iter_->Valid() && status_.ok(); }
|
|
|
|
void Seek(const Slice& target) override {
|
|
|
|
ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue);
|
|
|
|
std::string encoded;
|
|
|
|
AppendInternalKey(&encoded, ikey);
|
|
|
|
iter_->Seek(encoded);
|
|
|
|
}
|
|
|
|
void SeekForPrev(const Slice& target) override {
|
|
|
|
ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue);
|
|
|
|
std::string encoded;
|
|
|
|
AppendInternalKey(&encoded, ikey);
|
|
|
|
iter_->SeekForPrev(encoded);
|
|
|
|
}
|
|
|
|
void SeekToFirst() override { iter_->SeekToFirst(); }
|
|
|
|
void SeekToLast() override { iter_->SeekToLast(); }
|
|
|
|
void Next() override { iter_->Next(); }
|
|
|
|
void Prev() override { iter_->Prev(); }
|
|
|
|
IterBoundCheck UpperBoundCheckResult() override {
|
|
|
|
return iter_->UpperBoundCheckResult();
|
|
|
|
}
|
|
|
|
|
|
|
|
Slice key() const override {
|
|
|
|
assert(Valid());
|
|
|
|
ParsedInternalKey parsed_key;
|
|
|
|
Status pik_status =
|
|
|
|
ParseInternalKey(iter_->key(), &parsed_key, true /* log_err_key */);
|
|
|
|
if (!pik_status.ok()) {
|
|
|
|
status_ = pik_status;
|
|
|
|
return Slice(status_.getState());
|
|
|
|
}
|
|
|
|
return parsed_key.user_key;
|
|
|
|
}
|
|
|
|
|
|
|
|
Slice value() const override { return iter_->value(); }
|
|
|
|
Status status() const override {
|
|
|
|
return status_.ok() ? iter_->status() : status_;
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
mutable Status status_;
|
|
|
|
InternalIterator* iter_;
|
|
|
|
bool arena_mode_;
|
|
|
|
|
|
|
|
// No copying allowed
|
|
|
|
KeyConvertingIterator(const KeyConvertingIterator&);
|
|
|
|
void operator=(const KeyConvertingIterator&);
|
|
|
|
};
|
|
|
|
|
|
|
|
// `BlockConstructor` APIs always accept/return user keys.
|
|
|
|
class BlockConstructor : public Constructor {
|
|
|
|
public:
|
|
|
|
explicit BlockConstructor(const Comparator* cmp)
|
|
|
|
: Constructor(cmp), comparator_(cmp), block_(nullptr) {}
|
|
|
|
~BlockConstructor() override { delete block_; }
|
|
|
|
Status FinishImpl(const Options& /*options*/,
|
|
|
|
const ImmutableOptions& /*ioptions*/,
|
|
|
|
const MutableCFOptions& /*moptions*/,
|
|
|
|
const BlockBasedTableOptions& table_options,
|
|
|
|
const InternalKeyComparator& /*internal_comparator*/,
|
|
|
|
const stl_wrappers::KVMap& kv_map) override {
|
|
|
|
delete block_;
|
|
|
|
block_ = nullptr;
|
|
|
|
BlockBuilder builder(table_options.block_restart_interval);
|
|
|
|
|
|
|
|
for (const auto& kv : kv_map) {
|
|
|
|
// `DataBlockIter` assumes it reads only internal keys. `BlockConstructor`
|
|
|
|
// clients provide user keys, so we need to convert to internal key format
|
|
|
|
// before writing the data block.
|
|
|
|
ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue);
|
|
|
|
std::string encoded;
|
|
|
|
AppendInternalKey(&encoded, ikey);
|
|
|
|
builder.Add(encoded, kv.second);
|
|
|
|
}
|
|
|
|
// Open the block
|
|
|
|
data_ = builder.Finish().ToString();
|
|
|
|
BlockContents contents;
|
|
|
|
contents.data = data_;
|
|
|
|
block_ = new Block(std::move(contents));
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
InternalIterator* NewIterator(
|
|
|
|
const SliceTransform* /*prefix_extractor*/) const override {
|
|
|
|
// `DataBlockIter` returns the internal keys it reads.
|
|
|
|
// `KeyConvertingIterator` converts them to user keys before they are
|
|
|
|
// exposed to the `BlockConstructor` clients.
|
|
|
|
return new KeyConvertingIterator(
|
|
|
|
block_->NewDataIterator(comparator_, kDisableGlobalSequenceNumber));
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
const Comparator* comparator_;
|
|
|
|
std::string data_;
|
|
|
|
Block* block_;
|
|
|
|
|
|
|
|
BlockConstructor();
|
|
|
|
};
|
|
|
|
|
|
|
|
class TableConstructor : public Constructor {
|
|
|
|
public:
|
|
|
|
explicit TableConstructor(const Comparator* cmp,
|
|
|
|
bool convert_to_internal_key = false,
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
int level = -1, SequenceNumber largest_seqno = 0)
|
|
|
|
: Constructor(cmp),
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
largest_seqno_(largest_seqno),
|
|
|
|
convert_to_internal_key_(convert_to_internal_key),
|
|
|
|
level_(level) {
|
|
|
|
env_ = ROCKSDB_NAMESPACE::Env::Default();
|
|
|
|
}
|
|
|
|
// Cleanup is delegated to Reset(), which is defined outside this view.
~TableConstructor() override { Reset(); }
|
|
|
|
|
|
|
|
Status FinishImpl(const Options& options, const ImmutableOptions& ioptions,
|
|
|
|
const MutableCFOptions& moptions,
|
|
|
|
const BlockBasedTableOptions& /*table_options*/,
|
|
|
|
const InternalKeyComparator& internal_comparator,
|
|
|
|
const stl_wrappers::KVMap& kv_map) override {
|
|
|
|
Reset();
|
|
|
|
soptions.use_mmap_reads = ioptions.allow_mmap_reads;
|
|
|
|
std::unique_ptr<FSWritableFile> sink(new test::StringSink());
|
|
|
|
file_writer_.reset(new WritableFileWriter(
|
|
|
|
std::move(sink), "" /* don't care */, FileOptions()));
|
|
|
|
std::unique_ptr<TableBuilder> builder;
|
|
|
|
IntTblPropCollectorFactories int_tbl_prop_collector_factories;
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
|
|
|
|
if (largest_seqno_ != 0) {
|
|
|
|
// Pretend that it's an external file written by SstFileWriter.
|
|
|
|
int_tbl_prop_collector_factories.emplace_back(
|
|
|
|
new SstFileWriterPropertiesCollectorFactory(2 /* version */,
|
|
|
|
0 /* global_seqno*/));
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string column_family_name;
|
|
|
|
builder.reset(ioptions.table_factory->NewTableBuilder(
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
6 years ago
|
|
|
TableBuilderOptions(ioptions, moptions, internal_comparator,
|
|
|
|
&int_tbl_prop_collector_factories,
|
|
|
|
options.compression, options.compression_opts,
|
Add more LSM info to FilterBuildingContext (#8246)
Summary:
Add `num_levels`, `is_bottommost`, and table file creation
`reason` to `FilterBuildingContext`, in anticipation of more powerful
Bloom-like filter support.
To support this, added `is_bottommost` and `reason` to
`TableBuilderOptions`, which allowed removing `reason` parameter from
`rocksdb::BuildTable`.
I attempted to remove `skip_filters` from `TableBuilderOptions`, because
filter construction decisions should arise from options, not one-off
parameters. I could not completely remove it because the public API for
SstFileWriter takes a `skip_filters` parameter, and translating this
into an option change would mean awkwardly replacing the table_factory
if it is BlockBasedTableFactory with new filter_policy=nullptr option.
I marked this public skip_filters option as deprecated because of this
oddity. (skip_filters on the read side probably makes sense.)
At least `skip_filters` is now largely hidden for users of
`TableBuilderOptions` and is no longer used for implementing the
optimize_filters_for_hits option. Bringing the logic for that option
closer to handling of FilterBuildingContext makes it more obvious that
these two are using the same notion of "bottommost." (Planned:
configuration options for Bloom-like filters that generalize
`optimize_filters_for_hits`)
Recommended follow-up: Try to get away from "bottommost level" naming of
things, which is inaccurate (see
VersionStorageInfo::RangeMightExistAfterSortedRun), and move to
"bottommost run" or just "bottommost."
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8246
Test Plan:
extended an existing unit test to exercise and check various
filter building contexts. Also, existing tests for
optimize_filters_for_hits validate some of the "bottommost" handling,
which is now closely connected to FilterBuildingContext::is_bottommost
through TableBuilderOptions::is_bottommost
Reviewed By: mrambacher
Differential Revision: D28099346
Pulled By: pdillinger
fbshipit-source-id: 2c1072e29c24d4ac404c761a7b7663292372600a
4 years ago
|
|
|
kUnknownColumnFamily, column_family_name, level_),
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a thin layer for better portability. Less platform-dependent code should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
9 years ago
|
|
|
file_writer_.get()));
|
|
|
|
|
|
|
|
for (const auto& kv : kv_map) {
|
|
|
|
if (convert_to_internal_key_) {
|
|
|
|
ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue);
|
|
|
|
std::string encoded;
|
|
|
|
AppendInternalKey(&encoded, ikey);
|
|
|
|
builder->Add(encoded, kv.second);
|
|
|
|
} else {
|
|
|
|
builder->Add(kv.first, kv.second);
|
|
|
|
}
|
|
|
|
EXPECT_OK(builder->status());
|
|
|
|
}
|
|
|
|
Status s = builder->Finish();
|
|
|
|
EXPECT_OK(file_writer_->Flush());
|
rocksdb: Replace ASSERT* with EXPECT* in functions that do not return void
Summary:
gtest does not use exceptions to fail a unit test by design, and `ASSERT*`s are implemented using `return`. As a consequence we cannot use `ASSERT*` in a function that does not return `void` value ([[ https://code.google.com/p/googletest/wiki/AdvancedGuide#Assertion_Placement | 1]]), and have to fix our existing code. This diff does this in a generic way, with no manual changes.
In order to detect all existing `ASSERT*` that are used in functions that doesn't return void value, I change the code to generate compile errors for such cases.
In `util/testharness.h` I defined `EXPECT*` assertions, the same way as `ASSERT*`, and redefined `ASSERT*` to return `void`. Then executed:
```lang=bash
% USE_CLANG=1 make all -j55 -k 2> build.log
% perl -naF: -e 'print "-- -number=".$F[1]." ".$F[0]."\n" if /: error:/' \
build.log | xargs -L 1 perl -spi -e 's/ASSERT/EXPECT/g if $. == $number'
% make format
```
After that I reverted back change to `ASSERT*` in `util/testharness.h`. But preserved introduced `EXPECT*`, which is the same as `ASSERT*`. This will be deleted once switched to gtest.
This diff is independent and contains manual changes only in `util/testharness.h`.
Test Plan:
Make sure all tests are passing.
```lang=bash
% USE_CLANG=1 make check
```
Reviewers: igor, lgalanis, sdong, yufei.zhu, rven, meyering
Reviewed By: meyering
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D33333
10 years ago
|
|
|
EXPECT_TRUE(s.ok()) << s.ToString();
|
|
|
|
|
|
|
|
EXPECT_EQ(TEST_GetSink()->contents().size(), builder->FileSize());
|
|
|
|
|
|
|
|
// Open the table
|
|
|
|
uniq_id_ = cur_uniq_id_++;
|
|
|
|
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
3 years ago
|
|
|
return Reopen(ioptions, moptions);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns a new iterator over the table held by table_reader_.
//
// The iterator is created with read_options_, no arena, filters enabled and
// an uncategorized caller. When convert_to_internal_key_ is set, the table
// iterator is wrapped in a heap-allocated KeyConvertingIterator; otherwise
// the table iterator itself is returned.
InternalIterator* NewIterator(
    const SliceTransform* prefix_extractor) const override {
  InternalIterator* const table_iter = table_reader_->NewIterator(
      read_options_, prefix_extractor, /*arena=*/nullptr,
      /*skip_filters=*/false, TableReaderCaller::kUncategorized);
  if (!convert_to_internal_key_) {
    return table_iter;
  }
  return new KeyConvertingIterator(table_iter);
}
|
|
|
|
|
|
|
|
uint64_t ApproximateOffsetOf(const Slice& key) const {
|
|
|
|
if (convert_to_internal_key_) {
|
|
|
|
InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
|
|
|
|
const Slice skey = ikey.Encode();
|
|
|
|
return table_reader_->ApproximateOffsetOf(
|
|
|
|
skey, TableReaderCaller::kUncategorized);
|
|
|
|
}
|
|
|
|
return table_reader_->ApproximateOffsetOf(
|
|
|
|
key, TableReaderCaller::kUncategorized);
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual Status Reopen(const ImmutableOptions& ioptions,
|
|
|
|
const MutableCFOptions& moptions) {
|
|
|
|
std::unique_ptr<FSRandomAccessFile> source(new test::StringSource(
|
|
|
|
TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads));
|
|
|
|
|
|
|
|
file_reader_.reset(new RandomAccessFileReader(std::move(source), "test"));
|
|
|
|
return ioptions.table_factory->NewTableReader(
|
|
|
|
TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions,
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
3 years ago
|
|
|
*last_internal_comparator_, /*skip_filters*/ false,
|
|
|
|
/*immortal*/ false, false, level_, largest_seqno_,
|
|
|
|
&block_cache_tracer_, moptions.write_buffer_size, "",
|
|
|
|
uniq_id_),
|
|
|
|
std::move(file_reader_), TEST_GetSink()->contents().size(),
|
|
|
|
&table_reader_);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns a non-owning pointer to the current table reader; table_reader_
// keeps ownership. The pointer is null while table_reader_ is empty (e.g.
// after ResetTableReader()).
virtual TableReader* GetTableReader() { return table_reader_.get(); }
|
|
|
|
|
|
|
|
bool AnywayDeleteIterator() const override {
|
|
|
|
return convert_to_internal_key_;
|
|
|
|
}
|
|
|
|
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when an L0 block-based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead, the table reader takes ownership of them, hence pinning them, i.e., the LRU cache will never evict them. Meanwhile, in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
9 years ago
|
|
|
// Destroys the current table reader, if any, leaving table_reader_ empty.
void ResetTableReader() { table_reader_.reset(); }
|
|
|
|
|
|
|
|
// Reports whether this constructor converts keys: when true,
// ApproximateOffsetOf() wraps its argument into an InternalKey and
// NewIterator() wraps the table iterator in a KeyConvertingIterator.
bool ConvertToInternalKey() { return convert_to_internal_key_; }
|
|
|
|
|
|
|
|
// Returns the writable file behind file_writer_, downcast to
// test::StringSink. Requires file_writer_ to be non-null.
// NOTE(review): the static_cast assumes the writer was created over a
// test::StringSink -- confirm against the code that sets file_writer_.
test::StringSink* TEST_GetSink() {
  auto* const writable = file_writer_->writable_file();
  return static_cast<test::StringSink*>(writable);
}
|
|
|
|
|
|
|
|
// Tracer whose address is passed to TableReaderOptions in Reopen().
BlockCacheTracer block_cache_tracer_;
|
|
|
|
|
|
|
|
private:
|
|
|
|
void Reset() {
|
|
|
|
uniq_id_ = 0;
|
|
|
|
table_reader_.reset();
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
9 years ago
|
|
|
file_writer_.reset();
|
|
|
|
file_reader_.reset();
|
|
|
|
}
|
|
|
|
|
|
|
|
const ReadOptions read_options_;
|
|
|
|
uint64_t uniq_id_;
|
|
|
|
std::unique_ptr<WritableFileWriter> file_writer_;
|
|
|
|
std::unique_ptr<RandomAccessFileReader> file_reader_;
|
|
|
|
std::unique_ptr<TableReader> table_reader_;
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
SequenceNumber largest_seqno_;
|
|
|
|
bool convert_to_internal_key_;
|
|
|
|
int level_;
|
|
|
|
|
|
|
|
TableConstructor();
|
|
|
|
|
|
|
|
static uint64_t cur_uniq_id_;
|
|
|
|
EnvOptions soptions;
|
|
|
|
Env* env_;
|
|
|
|
};
|
|
|
|
// Out-of-class definition of the static id counter; it starts at 1, whereas
// Reset() sets a constructor's own uniq_id_ to 0.
uint64_t TableConstructor::cur_uniq_id_ = 1;
|
|
|
|
|
|
|
|
class MemTableConstructor: public Constructor {
|
|
|
|
public:
|
|
|
|
explicit MemTableConstructor(const Comparator* cmp, WriteBufferManager* wb)
|
|
|
|
: Constructor(cmp),
|
|
|
|
internal_comparator_(cmp),
|
|
|
|
write_buffer_manager_(wb),
|
|
|
|
table_factory_(new SkipListFactory) {
|
|
|
|
options_.memtable_factory = table_factory_;
|
|
|
|
ImmutableOptions ioptions(options_);
|
|
|
|
memtable_ =
|
|
|
|
new MemTable(internal_comparator_, ioptions, MutableCFOptions(options_),
|
|
|
|
wb, kMaxSequenceNumber, 0 /* column_family_id */);
|
|
|
|
memtable_->Ref();
|
|
|
|
}
|
|
|
|
~MemTableConstructor() override { delete memtable_->Unref(); }
|
|
|
|
Status FinishImpl(const Options&, const ImmutableOptions& ioptions,
|
|
|
|
const MutableCFOptions& /*moptions*/,
|
|
|
|
const BlockBasedTableOptions& /*table_options*/,
|
|
|
|
const InternalKeyComparator& /*internal_comparator*/,
|
|
|
|
const stl_wrappers::KVMap& kv_map) override {
|
|
|
|
delete memtable_->Unref();
|
|
|
|
ImmutableOptions mem_ioptions(ioptions);
|
|
|
|
memtable_ = new MemTable(internal_comparator_, mem_ioptions,
|
|
|
|
MutableCFOptions(options_), write_buffer_manager_,
|
|
|
|
kMaxSequenceNumber, 0 /* column_family_id */);
|
|
|
|
memtable_->Ref();
|
|
|
|
int seq = 1;
|
|
|
|
for (const auto& kv : kv_map) {
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
4 years ago
|
|
|
Status s = memtable_->Add(seq, kTypeValue, kv.first, kv.second,
|
|
|
|
nullptr /* kv_prot_info */);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
seq++;
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
InternalIterator* NewIterator(
|
|
|
|
const SliceTransform* /*prefix_extractor*/) const override {
|
|
|
|
return new KeyConvertingIterator(
|
|
|
|
memtable_->NewIterator(ReadOptions(), &arena_), true);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool AnywayDeleteIterator() const override { return true; }
|
|
|
|
|
|
|
|
bool IsArenaMode() const override { return true; }
|
|
|
|
|
|
|
|
private:
|
|
|
|
mutable Arena arena_;
|
|
|
|
InternalKeyComparator internal_comparator_;
|
|
|
|
Options options_;
|
|
|
|
WriteBufferManager* write_buffer_manager_;
|
|
|
|
MemTable* memtable_;
|
|
|
|
std::shared_ptr<SkipListFactory> table_factory_;
|
|
|
|
};
|
|
|
|
|
|
|
|
class InternalIteratorFromIterator : public InternalIterator {
|
|
|
|
public:
|
|
|
|
explicit InternalIteratorFromIterator(Iterator* it) : it_(it) {}
|
|
|
|
bool Valid() const override { return it_->Valid(); }
|
|
|
|
void Seek(const Slice& target) override { it_->Seek(target); }
|
|
|
|
void SeekForPrev(const Slice& target) override { it_->SeekForPrev(target); }
|
|
|
|
void SeekToFirst() override { it_->SeekToFirst(); }
|
|
|
|
void SeekToLast() override { it_->SeekToLast(); }
|
|
|
|
void Next() override { it_->Next(); }
|
|
|
|
void Prev() override { it_->Prev(); }
|
|
|
|
Slice key() const override { return it_->key(); }
|
|
|
|
Slice value() const override { return it_->value(); }
|
|
|
|
Status status() const override { return it_->status(); }
|
|
|
|
|
|
|
|
private:
|
|
|
|
std::unique_ptr<Iterator> it_;
|
|
|
|
};
|
|
|
|
|
|
|
|
class DBConstructor: public Constructor {
|
|
|
|
public:
|
|
|
|
explicit DBConstructor(const Comparator* cmp)
|
|
|
|
: Constructor(cmp),
|
|
|
|
comparator_(cmp) {
|
|
|
|
db_ = nullptr;
|
|
|
|
NewDB();
|
|
|
|
}
|
|
|
|
~DBConstructor() override { delete db_; }
|
|
|
|
Status FinishImpl(const Options& /*options*/,
|
|
|
|
const ImmutableOptions& /*ioptions*/,
|
|
|
|
const MutableCFOptions& /*moptions*/,
|
|
|
|
const BlockBasedTableOptions& /*table_options*/,
|
|
|
|
const InternalKeyComparator& /*internal_comparator*/,
|
|
|
|
const stl_wrappers::KVMap& kv_map) override {
|
|
|
|
delete db_;
|
|
|
|
db_ = nullptr;
|
|
|
|
NewDB();
|
|
|
|
for (const auto& kv : kv_map) {
|
|
|
|
WriteBatch batch;
|
|
|
|
EXPECT_OK(batch.Put(kv.first, kv.second));
|
rocksdb: Replace ASSERT* with EXPECT* in functions that does not return void value
Summary:
gtest does not use exceptions to fail a unit test by design, and `ASSERT*`s are implemented using `return`. As a consequence we cannot use `ASSERT*` in a function that does not return `void` value ([[ https://code.google.com/p/googletest/wiki/AdvancedGuide#Assertion_Placement | 1]]), and have to fix our existing code. This diff does this in a generic way, with no manual changes.
In order to detect all existing `ASSERT*` that are used in functions that doesn't return void value, I change the code to generate compile errors for such cases.
In `util/testharness.h` I defined `EXPECT*` assertions, the same way as `ASSERT*`, and redefined `ASSERT*` to return `void`. Then executed:
```lang=bash
% USE_CLANG=1 make all -j55 -k 2> build.log
% perl -naF: -e 'print "-- -number=".$F[1]." ".$F[0]."\n" if /: error:/' \
build.log | xargs -L 1 perl -spi -e 's/ASSERT/EXPECT/g if $. == $number'
% make format
```
After that I reverted back change to `ASSERT*` in `util/testharness.h`. But preserved introduced `EXPECT*`, which is the same as `ASSERT*`. This will be deleted once switched to gtest.
This diff is independent and contains manual changes only in `util/testharness.h`.
Test Plan:
Make sure all tests are passing.
```lang=bash
% USE_CLANG=1 make check
```
Reviewers: igor, lgalanis, sdong, yufei.zhu, rven, meyering
Reviewed By: meyering
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D33333
10 years ago
|
|
|
EXPECT_TRUE(db_->Write(WriteOptions(), &batch).ok());
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
InternalIterator* NewIterator(
|
|
|
|
const SliceTransform* /*prefix_extractor*/) const override {
|
|
|
|
return new InternalIteratorFromIterator(db_->NewIterator(ReadOptions()));
|
|
|
|
}
|
|
|
|
|
|
|
|
DB* db() const override { return db_; }
|
|
|
|
|
|
|
|
private:
|
|
|
|
void NewDB() {
|
|
|
|
std::string name = test::PerThreadDBPath("table_testdb");
|
|
|
|
|
|
|
|
Options options;
|
|
|
|
options.comparator = comparator_;
|
|
|
|
Status status = DestroyDB(name, options);
|
|
|
|
ASSERT_TRUE(status.ok()) << status.ToString();
|
|
|
|
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.error_if_exists = true;
|
|
|
|
options.write_buffer_size = 10000; // Something small to force merging
|
|
|
|
status = DB::Open(options, name, &db_);
|
|
|
|
ASSERT_TRUE(status.ok()) << status.ToString();
|
|
|
|
}
|
|
|
|
|
|
|
|
const Comparator* comparator_;
|
|
|
|
DB* db_;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Kinds of storage exercised by the parameterized harness. HarnessTest's
// constructor switches on this value to pick the table factory and the
// Constructor implementation.
enum TestType {
  // Block-based table built through BlockBasedTableFactory.
  BLOCK_BASED_TABLE_TEST,
#ifndef ROCKSDB_LITE
  // Plain table with a FixedOrLessPrefixTransform(2) prefix extractor.
  PLAIN_TABLE_SEMI_FIXED_PREFIX,
  // Plain table with a no-op (whole key) prefix extractor.
  PLAIN_TABLE_FULL_STR_PREFIX,
  // Plain table without a prefix extractor.
  PLAIN_TABLE_TOTAL_ORDER,
#endif  // !ROCKSDB_LITE
  // A single block, built with BlockConstructor.
  BLOCK_TEST,
  // NOTE(review): presumably backed by MemTableConstructor -- confirm in
  // HarnessTest.
  MEMTABLE_TEST,
  // NOTE(review): presumably backed by DBConstructor -- confirm in
  // HarnessTest.
  DB_TEST
};
|
|
|
|
|
|
|
|
struct TestArgs {
|
|
|
|
TestType type;
|
|
|
|
bool reverse_compare;
|
|
|
|
int restart_interval;
|
|
|
|
CompressionType compression;
|
|
|
|
uint32_t compression_parallel_threads;
|
|
|
|
uint32_t format_version;
|
|
|
|
bool use_mmap;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Writes a one-line, human-readable description of `args` to `os` and
// returns `os`.
//
// Both enum fields are streamed as integers. The explicit cast matters for
// `compression`: CompressionType is declared with an `unsigned char`
// underlying type, so streaming it directly can select the character
// overload and emit a raw control byte instead of a readable number.
std::ostream& operator<<(std::ostream& os, const TestArgs& args) {
  os << "type: " << static_cast<int>(args.type)
     << " reverse_compare: " << args.reverse_compare
     << " restart_interval: " << args.restart_interval
     << " compression: " << static_cast<int>(args.compression)
     << " compression_parallel_threads: " << args.compression_parallel_threads
     << " format_version: " << args.format_version
     << " use_mmap: " << args.use_mmap;

  return os;
}
|
|
|
|
|
|
|
|
static std::vector<TestArgs> GenerateArgList() {
|
|
|
|
std::vector<TestArgs> test_args;
|
|
|
|
std::vector<TestType> test_types = {
|
|
|
|
BLOCK_BASED_TABLE_TEST,
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
PLAIN_TABLE_SEMI_FIXED_PREFIX,
|
|
|
|
PLAIN_TABLE_FULL_STR_PREFIX,
|
|
|
|
PLAIN_TABLE_TOTAL_ORDER,
|
|
|
|
#endif // !ROCKSDB_LITE
|
|
|
|
BLOCK_TEST,
|
|
|
|
MEMTABLE_TEST, DB_TEST};
|
|
|
|
std::vector<bool> reverse_compare_types = {false, true};
|
|
|
|
std::vector<int> restart_intervals = {16, 1, 1024};
|
|
|
|
std::vector<uint32_t> compression_parallel_threads = {1, 4};
|
|
|
|
|
|
|
|
// Only add compression if it is supported
|
|
|
|
std::vector<std::pair<CompressionType, bool>> compression_types;
|
|
|
|
compression_types.emplace_back(kNoCompression, false);
|
|
|
|
if (Snappy_Supported()) {
|
|
|
|
compression_types.emplace_back(kSnappyCompression, false);
|
|
|
|
}
|
|
|
|
if (Zlib_Supported()) {
|
|
|
|
compression_types.emplace_back(kZlibCompression, false);
|
|
|
|
compression_types.emplace_back(kZlibCompression, true);
|
|
|
|
}
|
|
|
|
if (BZip2_Supported()) {
|
|
|
|
compression_types.emplace_back(kBZip2Compression, false);
|
|
|
|
compression_types.emplace_back(kBZip2Compression, true);
|
|
|
|
}
|
|
|
|
if (LZ4_Supported()) {
|
|
|
|
compression_types.emplace_back(kLZ4Compression, false);
|
|
|
|
compression_types.emplace_back(kLZ4Compression, true);
|
|
|
|
compression_types.emplace_back(kLZ4HCCompression, false);
|
|
|
|
compression_types.emplace_back(kLZ4HCCompression, true);
|
|
|
|
}
|
|
|
|
if (XPRESS_Supported()) {
|
|
|
|
compression_types.emplace_back(kXpressCompression, false);
|
|
|
|
compression_types.emplace_back(kXpressCompression, true);
|
|
|
|
}
|
|
|
|
if (ZSTD_Supported()) {
|
|
|
|
compression_types.emplace_back(kZSTD, false);
|
|
|
|
compression_types.emplace_back(kZSTD, true);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (auto test_type : test_types) {
|
|
|
|
for (auto reverse_compare : reverse_compare_types) {
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
if (test_type == PLAIN_TABLE_SEMI_FIXED_PREFIX ||
|
|
|
|
test_type == PLAIN_TABLE_FULL_STR_PREFIX ||
|
|
|
|
test_type == PLAIN_TABLE_TOTAL_ORDER) {
|
|
|
|
// Plain table doesn't use restart index or compression.
|
|
|
|
TestArgs one_arg;
|
|
|
|
one_arg.type = test_type;
|
|
|
|
one_arg.reverse_compare = reverse_compare;
|
|
|
|
one_arg.restart_interval = restart_intervals[0];
|
|
|
|
one_arg.compression = compression_types[0].first;
|
|
|
|
one_arg.compression_parallel_threads = 1;
|
|
|
|
one_arg.format_version = 0;
|
|
|
|
one_arg.use_mmap = true;
|
|
|
|
test_args.push_back(one_arg);
|
|
|
|
one_arg.use_mmap = false;
|
|
|
|
test_args.push_back(one_arg);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
#endif // !ROCKSDB_LITE
|
|
|
|
|
|
|
|
for (auto restart_interval : restart_intervals) {
|
|
|
|
for (auto compression_type : compression_types) {
|
|
|
|
for (auto num_threads : compression_parallel_threads) {
|
|
|
|
TestArgs one_arg;
|
|
|
|
one_arg.type = test_type;
|
|
|
|
one_arg.reverse_compare = reverse_compare;
|
|
|
|
one_arg.restart_interval = restart_interval;
|
|
|
|
one_arg.compression = compression_type.first;
|
|
|
|
one_arg.compression_parallel_threads = num_threads;
|
|
|
|
one_arg.format_version = compression_type.second ? 2 : 1;
|
|
|
|
one_arg.use_mmap = false;
|
|
|
|
test_args.push_back(one_arg);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return test_args;
|
|
|
|
}
|
|
|
|
|
|
|
|
// In order to make all tests run for plain table format, including
|
|
|
|
// those operating on empty keys, create a new prefix transformer which
|
|
|
|
// return fixed prefix if the slice is not shorter than the prefix length,
|
|
|
|
// and the full slice if it is shorter.
|
|
|
|
class FixedOrLessPrefixTransform : public SliceTransform {
|
|
|
|
private:
|
|
|
|
const size_t prefix_len_;
|
|
|
|
|
|
|
|
public:
|
|
|
|
explicit FixedOrLessPrefixTransform(size_t prefix_len) :
|
|
|
|
prefix_len_(prefix_len) {
|
|
|
|
}
|
|
|
|
|
|
|
|
const char* Name() const override { return "rocksdb.FixedPrefix"; }
|
|
|
|
|
|
|
|
Slice Transform(const Slice& src) const override {
|
|
|
|
assert(InDomain(src));
|
|
|
|
if (src.size() < prefix_len_) {
|
|
|
|
return src;
|
|
|
|
}
|
|
|
|
return Slice(src.data(), prefix_len_);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool InDomain(const Slice& /*src*/) const override { return true; }
|
|
|
|
|
|
|
|
bool InRange(const Slice& dst) const override {
|
|
|
|
return (dst.size() <= prefix_len_);
|
|
|
|
}
|
|
|
|
bool FullLengthEnabled(size_t* /*len*/) const override { return false; }
|
|
|
|
};
|
|
|
|
|
|
|
|
class HarnessTest : public testing::Test {
|
|
|
|
public:
|
|
|
|
explicit HarnessTest(const TestArgs& args)
|
|
|
|
: args_(args),
|
|
|
|
ioptions_(options_),
|
|
|
|
moptions_(options_),
|
|
|
|
write_buffer_(options_.db_write_buffer_size),
|
|
|
|
support_prev_(true),
|
|
|
|
only_support_prefix_seek_(false) {
|
|
|
|
options_.compression = args_.compression;
|
|
|
|
options_.compression_opts.parallel_threads =
|
|
|
|
args_.compression_parallel_threads;
|
|
|
|
// Use shorter block size for tests to exercise block boundary
|
|
|
|
// conditions more.
|
|
|
|
if (args_.reverse_compare) {
|
|
|
|
options_.comparator = &reverse_key_comparator;
|
|
|
|
}
|
|
|
|
|
|
|
|
internal_comparator_.reset(
|
|
|
|
new test::PlainInternalKeyComparator(options_.comparator));
|
|
|
|
|
|
|
|
options_.allow_mmap_reads = args_.use_mmap;
|
|
|
|
switch (args_.type) {
|
|
|
|
case BLOCK_BASED_TABLE_TEST:
|
|
|
|
table_options_.flush_block_policy_factory.reset(
|
|
|
|
new FlushBlockBySizePolicyFactory());
|
|
|
|
table_options_.block_size = 256;
|
|
|
|
table_options_.block_restart_interval = args_.restart_interval;
|
|
|
|
table_options_.index_block_restart_interval = args_.restart_interval;
|
|
|
|
table_options_.format_version = args_.format_version;
|
|
|
|
options_.table_factory.reset(
|
|
|
|
new BlockBasedTableFactory(table_options_));
|
|
|
|
constructor_.reset(new TableConstructor(
|
|
|
|
options_.comparator, true /* convert_to_internal_key_ */));
|
|
|
|
internal_comparator_.reset(
|
|
|
|
new InternalKeyComparator(options_.comparator));
|
|
|
|
break;
|
|
|
|
// Plain table is not supported in ROCKSDB_LITE
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
case PLAIN_TABLE_SEMI_FIXED_PREFIX:
|
|
|
|
support_prev_ = false;
|
|
|
|
only_support_prefix_seek_ = true;
|
|
|
|
options_.prefix_extractor.reset(new FixedOrLessPrefixTransform(2));
|
|
|
|
options_.table_factory.reset(NewPlainTableFactory());
|
|
|
|
constructor_.reset(new TableConstructor(
|
|
|
|
options_.comparator, true /* convert_to_internal_key_ */));
|
|
|
|
internal_comparator_.reset(
|
|
|
|
new InternalKeyComparator(options_.comparator));
|
|
|
|
break;
|
|
|
|
case PLAIN_TABLE_FULL_STR_PREFIX:
|
|
|
|
support_prev_ = false;
|
|
|
|
only_support_prefix_seek_ = true;
|
|
|
|
options_.prefix_extractor.reset(NewNoopTransform());
|
|
|
|
options_.table_factory.reset(NewPlainTableFactory());
|
|
|
|
constructor_.reset(new TableConstructor(
|
|
|
|
options_.comparator, true /* convert_to_internal_key_ */));
|
|
|
|
internal_comparator_.reset(
|
|
|
|
new InternalKeyComparator(options_.comparator));
|
|
|
|
break;
|
|
|
|
case PLAIN_TABLE_TOTAL_ORDER:
|
|
|
|
support_prev_ = false;
|
|
|
|
only_support_prefix_seek_ = false;
|
|
|
|
options_.prefix_extractor = nullptr;
|
|
|
|
|
|
|
|
{
|
|
|
|
PlainTableOptions plain_table_options;
|
|
|
|
plain_table_options.user_key_len = kPlainTableVariableLength;
|
|
|
|
plain_table_options.bloom_bits_per_key = 0;
|
|
|
|
plain_table_options.hash_table_ratio = 0;
|
|
|
|
|
|
|
|
options_.table_factory.reset(
|
|
|
|
NewPlainTableFactory(plain_table_options));
|
|
|
|
}
|
|
|
|
constructor_.reset(new TableConstructor(
|
|
|
|
options_.comparator, true /* convert_to_internal_key_ */));
|
|
|
|
internal_comparator_.reset(
|
|
|
|
new InternalKeyComparator(options_.comparator));
|
|
|
|
break;
|
|
|
|
#endif // !ROCKSDB_LITE
|
|
|
|
case BLOCK_TEST:
|
|
|
|
table_options_.block_size = 256;
|
|
|
|
options_.table_factory.reset(
|
|
|
|
new BlockBasedTableFactory(table_options_));
|
|
|
|
constructor_.reset(new BlockConstructor(options_.comparator));
|
|
|
|
break;
|
|
|
|
case MEMTABLE_TEST:
|
|
|
|
table_options_.block_size = 256;
|
|
|
|
options_.table_factory.reset(
|
|
|
|
new BlockBasedTableFactory(table_options_));
|
|
|
|
constructor_.reset(
|
|
|
|
new MemTableConstructor(options_.comparator, &write_buffer_));
|
|
|
|
break;
|
|
|
|
case DB_TEST:
|
|
|
|
table_options_.block_size = 256;
|
|
|
|
options_.table_factory.reset(
|
|
|
|
new BlockBasedTableFactory(table_options_));
|
|
|
|
constructor_.reset(new DBConstructor(options_.comparator));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
ioptions_ = ImmutableOptions(options_);
|
|
|
|
moptions_ = MutableCFOptions(options_);
|
|
|
|
}
|
|
|
|
|
|
|
|
void Add(const std::string& key, const std::string& value) {
|
|
|
|
constructor_->Add(key, value);
|
|
|
|
}
|
|
|
|
|
|
|
|
void Test(Random* rnd) {
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap data;
|
|
|
|
constructor_->Finish(options_, ioptions_, moptions_, table_options_,
|
|
|
|
*internal_comparator_, &keys, &data);
|
|
|
|
|
|
|
|
TestForwardScan(keys, data);
|
|
|
|
if (support_prev_) {
|
|
|
|
TestBackwardScan(keys, data);
|
|
|
|
}
|
|
|
|
TestRandomAccess(rnd, keys, data);
|
|
|
|
}
|
|
|
|
|
|
|
|
void TestForwardScan(const std::vector<std::string>& /*keys*/,
|
|
|
|
const stl_wrappers::KVMap& data) {
|
|
|
|
InternalIterator* iter = constructor_->NewIterator();
|
|
|
|
ASSERT_TRUE(!iter->Valid());
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
for (stl_wrappers::KVMap::const_iterator model_iter = data.begin();
|
|
|
|
model_iter != data.end(); ++model_iter) {
|
|
|
|
ASSERT_EQ(ToString(data, model_iter), ToString(iter));
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
}
|
|
|
|
ASSERT_TRUE(!iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) {
|
|
|
|
iter->~InternalIterator();
|
|
|
|
} else {
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void TestBackwardScan(const std::vector<std::string>& /*keys*/,
|
|
|
|
const stl_wrappers::KVMap& data) {
|
|
|
|
InternalIterator* iter = constructor_->NewIterator();
|
|
|
|
ASSERT_TRUE(!iter->Valid());
|
|
|
|
iter->SeekToLast();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
for (stl_wrappers::KVMap::const_reverse_iterator model_iter = data.rbegin();
|
|
|
|
model_iter != data.rend(); ++model_iter) {
|
|
|
|
ASSERT_EQ(ToString(data, model_iter), ToString(iter));
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
}
|
|
|
|
ASSERT_TRUE(!iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) {
|
|
|
|
iter->~InternalIterator();
|
|
|
|
} else {
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void TestRandomAccess(Random* rnd, const std::vector<std::string>& keys,
|
|
|
|
const stl_wrappers::KVMap& data) {
|
|
|
|
static const bool kVerbose = false;
|
|
|
|
InternalIterator* iter = constructor_->NewIterator();
|
|
|
|
ASSERT_TRUE(!iter->Valid());
|
|
|
|
stl_wrappers::KVMap::const_iterator model_iter = data.begin();
|
|
|
|
if (kVerbose) fprintf(stderr, "---\n");
|
|
|
|
for (int i = 0; i < 200; i++) {
|
|
|
|
const int toss = rnd->Uniform(support_prev_ ? 5 : 3);
|
|
|
|
switch (toss) {
|
|
|
|
case 0: {
|
|
|
|
if (iter->Valid()) {
|
|
|
|
if (kVerbose) fprintf(stderr, "Next\n");
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
++model_iter;
|
|
|
|
ASSERT_EQ(ToString(data, model_iter), ToString(iter));
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case 1: {
|
|
|
|
if (kVerbose) fprintf(stderr, "SeekToFirst\n");
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
model_iter = data.begin();
|
|
|
|
ASSERT_EQ(ToString(data, model_iter), ToString(iter));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case 2: {
|
|
|
|
std::string key = PickRandomKey(rnd, keys);
|
|
|
|
model_iter = data.lower_bound(key);
|
|
|
|
if (kVerbose) fprintf(stderr, "Seek '%s'\n",
|
|
|
|
EscapeString(key).c_str());
|
|
|
|
iter->Seek(Slice(key));
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ(ToString(data, model_iter), ToString(iter));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case 3: {
|
|
|
|
if (iter->Valid()) {
|
|
|
|
if (kVerbose) fprintf(stderr, "Prev\n");
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
if (model_iter == data.begin()) {
|
|
|
|
model_iter = data.end(); // Wrap around to invalid value
|
|
|
|
} else {
|
|
|
|
--model_iter;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(ToString(data, model_iter), ToString(iter));
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case 4: {
|
|
|
|
if (kVerbose) fprintf(stderr, "SeekToLast\n");
|
|
|
|
iter->SeekToLast();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
if (keys.empty()) {
|
|
|
|
model_iter = data.end();
|
|
|
|
} else {
|
|
|
|
std::string last = data.rbegin()->first;
|
|
|
|
model_iter = data.lower_bound(last);
|
|
|
|
}
|
|
|
|
ASSERT_EQ(ToString(data, model_iter), ToString(iter));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) {
|
|
|
|
iter->~InternalIterator();
|
|
|
|
} else {
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string ToString(const stl_wrappers::KVMap& data,
|
|
|
|
const stl_wrappers::KVMap::const_iterator& it) {
|
|
|
|
if (it == data.end()) {
|
|
|
|
return "END";
|
|
|
|
} else {
|
|
|
|
return "'" + it->first + "->" + it->second + "'";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string ToString(const stl_wrappers::KVMap& data,
|
|
|
|
const stl_wrappers::KVMap::const_reverse_iterator& it) {
|
|
|
|
if (it == data.rend()) {
|
|
|
|
return "END";
|
|
|
|
} else {
|
|
|
|
return "'" + it->first + "->" + it->second + "'";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string ToString(const InternalIterator* it) {
|
|
|
|
if (!it->Valid()) {
|
|
|
|
return "END";
|
|
|
|
} else {
|
|
|
|
return "'" + it->key().ToString() + "->" + it->value().ToString() + "'";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string PickRandomKey(Random* rnd, const std::vector<std::string>& keys) {
|
|
|
|
if (keys.empty()) {
|
|
|
|
return "foo";
|
|
|
|
} else {
|
|
|
|
const int index = rnd->Uniform(static_cast<int>(keys.size()));
|
|
|
|
std::string result = keys[index];
|
|
|
|
switch (rnd->Uniform(support_prev_ ? 3 : 1)) {
|
|
|
|
case 0:
|
|
|
|
// Return an existing key
|
|
|
|
break;
|
|
|
|
case 1: {
|
|
|
|
// Attempt to return something smaller than an existing key
|
|
|
|
if (result.size() > 0 && result[result.size() - 1] > '\0'
|
|
|
|
&& (!only_support_prefix_seek_
|
|
|
|
|| options_.prefix_extractor->Transform(result).size()
|
|
|
|
< result.size())) {
|
|
|
|
result[result.size() - 1]--;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case 2: {
|
|
|
|
// Return something larger than an existing key
|
|
|
|
Increment(options_.comparator, &result);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns nullptr if not running against a DB
|
|
|
|
DB* db() const { return constructor_->db(); }
|
|
|
|
|
|
|
|
private:
|
|
|
|
TestArgs args_;
|
|
|
|
Options options_;
|
|
|
|
ImmutableOptions ioptions_;
|
|
|
|
MutableCFOptions moptions_;
|
|
|
|
BlockBasedTableOptions table_options_;
|
|
|
|
std::unique_ptr<Constructor> constructor_;
|
|
|
|
WriteBufferManager write_buffer_;
|
|
|
|
bool support_prev_;
|
|
|
|
bool only_support_prefix_seek_;
|
|
|
|
std::shared_ptr<InternalKeyComparator> internal_comparator_;
|
|
|
|
};
|
|
|
|
|
|
|
|
class ParameterizedHarnessTest : public HarnessTest,
|
|
|
|
public testing::WithParamInterface<TestArgs> {
|
|
|
|
public:
|
|
|
|
ParameterizedHarnessTest() : HarnessTest(GetParam()) {}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Instantiate ParameterizedHarnessTest once per entry produced by
// GenerateArgList().
INSTANTIATE_TEST_CASE_P(
    TableTest, ParameterizedHarnessTest,
    ::testing::ValuesIn(GenerateArgList()));
|
|
|
|
|
|
|
|
class DBHarnessTest : public HarnessTest {
|
|
|
|
public:
|
|
|
|
DBHarnessTest()
|
|
|
|
: HarnessTest(TestArgs{DB_TEST, /* reverse_compare */ false,
|
|
|
|
/* restart_interval */ 16, kNoCompression,
|
|
|
|
/* compression_parallel_threads */ 1,
|
|
|
|
/* format_version */ 0, /* use_mmap */ false}) {}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Returns true when `val` lies in the closed interval [low, high]. On
// failure a diagnostic naming the value and the bounds is written to stderr
// and false is returned.
static bool Between(uint64_t val, uint64_t low, uint64_t high) {
  const bool in_range = !(val < low) && !(val > high);
  if (in_range) {
    return true;
  }
  fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
          static_cast<unsigned long long>(val),
          static_cast<unsigned long long>(low),
          static_cast<unsigned long long>(high));
  return false;
}
|
|
|
|
|
|
|
|
// Tests against all kinds of tables
|
|
|
|
class TableTest : public testing::Test {
|
|
|
|
public:
|
|
|
|
const InternalKeyComparator& GetPlainInternalComparator(
|
|
|
|
const Comparator* comp) {
|
|
|
|
if (!plain_internal_comparator) {
|
|
|
|
plain_internal_comparator.reset(
|
|
|
|
new test::PlainInternalKeyComparator(comp));
|
|
|
|
}
|
|
|
|
return *plain_internal_comparator;
|
|
|
|
}
|
|
|
|
void IndexTest(BlockBasedTableOptions table_options);
|
|
|
|
|
|
|
|
private:
|
|
|
|
std::unique_ptr<InternalKeyComparator> plain_internal_comparator;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Empty fixture derived from TableTest; it adds no members of its own and
// only provides a distinct fixture name.
class GeneralTableTest : public TableTest {};
|
|
|
|
class BlockBasedTableTest
|
|
|
|
: public TableTest,
|
|
|
|
virtual public ::testing::WithParamInterface<uint32_t> {
|
|
|
|
public:
|
|
|
|
BlockBasedTableTest() : format_(GetParam()) {
|
|
|
|
env_ = ROCKSDB_NAMESPACE::Env::Default();
|
|
|
|
}
|
|
|
|
|
|
|
|
BlockBasedTableOptions GetBlockBasedTableOptions() {
|
|
|
|
BlockBasedTableOptions options;
|
|
|
|
options.format_version = format_;
|
|
|
|
return options;
|
|
|
|
}
|
|
|
|
|
|
|
|
void SetupTracingTest(TableConstructor* c) {
|
|
|
|
test_path_ = test::PerThreadDBPath("block_based_table_tracing_test");
|
|
|
|
EXPECT_OK(env_->CreateDir(test_path_));
|
|
|
|
trace_file_path_ = test_path_ + "/block_cache_trace_file";
|
|
|
|
TraceOptions trace_opt;
|
|
|
|
std::unique_ptr<TraceWriter> trace_writer;
|
|
|
|
EXPECT_OK(NewFileTraceWriter(env_, EnvOptions(), trace_file_path_,
|
|
|
|
&trace_writer));
|
|
|
|
// Always return Status::OK().
|
|
|
|
assert(c->block_cache_tracer_
|
|
|
|
.StartTrace(env_->GetSystemClock().get(), trace_opt,
|
|
|
|
std::move(trace_writer))
|
|
|
|
.ok());
|
|
|
|
{
|
|
|
|
std::string user_key = "k01";
|
|
|
|
InternalKey internal_key(user_key, 0, kTypeValue);
|
|
|
|
std::string encoded_key = internal_key.Encode().ToString();
|
|
|
|
c->Add(encoded_key, kDummyValue);
|
|
|
|
}
|
|
|
|
{
|
|
|
|
std::string user_key = "k02";
|
|
|
|
InternalKey internal_key(user_key, 0, kTypeValue);
|
|
|
|
std::string encoded_key = internal_key.Encode().ToString();
|
|
|
|
c->Add(encoded_key, kDummyValue);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void VerifyBlockAccessTrace(
|
|
|
|
TableConstructor* c,
|
|
|
|
const std::vector<BlockCacheTraceRecord>& expected_records) {
|
|
|
|
c->block_cache_tracer_.EndTrace();
|
|
|
|
|
|
|
|
{
|
|
|
|
std::unique_ptr<TraceReader> trace_reader;
|
|
|
|
Status s =
|
|
|
|
NewFileTraceReader(env_, EnvOptions(), trace_file_path_, &trace_reader);
|
|
|
|
EXPECT_OK(s);
|
|
|
|
BlockCacheTraceReader reader(std::move(trace_reader));
|
|
|
|
BlockCacheTraceHeader header;
|
|
|
|
EXPECT_OK(reader.ReadHeader(&header));
|
|
|
|
uint32_t index = 0;
|
|
|
|
while (s.ok()) {
|
|
|
|
BlockCacheTraceRecord access;
|
|
|
|
s = reader.ReadAccess(&access);
|
|
|
|
if (!s.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
ASSERT_LT(index, expected_records.size());
|
|
|
|
EXPECT_NE("", access.block_key);
|
|
|
|
EXPECT_EQ(access.block_type, expected_records[index].block_type);
|
|
|
|
EXPECT_GT(access.block_size, 0);
|
|
|
|
EXPECT_EQ(access.caller, expected_records[index].caller);
|
|
|
|
EXPECT_EQ(access.no_insert, expected_records[index].no_insert);
|
|
|
|
EXPECT_EQ(access.is_cache_hit, expected_records[index].is_cache_hit);
|
|
|
|
// Get
|
|
|
|
if (access.caller == TableReaderCaller::kUserGet) {
|
|
|
|
EXPECT_EQ(access.referenced_key,
|
|
|
|
expected_records[index].referenced_key);
|
|
|
|
EXPECT_EQ(access.get_id, expected_records[index].get_id);
|
|
|
|
EXPECT_EQ(access.get_from_user_specified_snapshot,
|
|
|
|
expected_records[index].get_from_user_specified_snapshot);
|
|
|
|
if (access.block_type == TraceType::kBlockTraceDataBlock) {
|
|
|
|
EXPECT_GT(access.referenced_data_size, 0);
|
|
|
|
EXPECT_GT(access.num_keys_in_block, 0);
|
|
|
|
EXPECT_EQ(access.referenced_key_exist_in_block,
|
|
|
|
expected_records[index].referenced_key_exist_in_block);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
EXPECT_EQ(access.referenced_key, "");
|
|
|
|
EXPECT_EQ(access.get_id, 0);
|
|
|
|
EXPECT_TRUE(access.get_from_user_specified_snapshot == Boolean::kFalse);
|
|
|
|
EXPECT_EQ(access.referenced_data_size, 0);
|
|
|
|
EXPECT_EQ(access.num_keys_in_block, 0);
|
|
|
|
EXPECT_TRUE(access.referenced_key_exist_in_block == Boolean::kFalse);
|
|
|
|
}
|
|
|
|
index++;
|
|
|
|
}
|
|
|
|
EXPECT_EQ(index, expected_records.size());
|
|
|
|
}
|
|
|
|
EXPECT_OK(env_->DeleteFile(trace_file_path_));
|
|
|
|
EXPECT_OK(env_->DeleteDir(test_path_));
|
|
|
|
}
|
|
|
|
|
|
|
|
protected:
|
|
|
|
uint64_t IndexUncompressedHelper(bool indexCompress);
|
|
|
|
|
|
|
|
private:
|
|
|
|
uint32_t format_;
|
|
|
|
Env* env_;
|
|
|
|
std::string trace_file_path_;
|
|
|
|
std::string test_path_;
|
|
|
|
};
|
|
|
|
// Empty fixture derived from TableTest; it adds no members of its own and
// only provides a distinct fixture name.
class PlainTableTest : public TableTest {};
|
|
|
|
// Empty fixture (no shared state) used by the TEST_F(TablePropertyTest, ...)
// cases in this file.
class TablePropertyTest : public testing::Test {};
|
|
|
|
// Empty fixture derived from TableTest; it adds no members of its own and
// only provides a distinct fixture name.
class BBTTailPrefetchTest : public TableTest {};
|
|
|
|
|
|
|
|
// The helper class to test the file checksum
|
|
|
|
class FileChecksumTestHelper {
|
|
|
|
public:
|
|
|
|
FileChecksumTestHelper(bool convert_to_internal_key = false)
|
|
|
|
: convert_to_internal_key_(convert_to_internal_key) {
|
|
|
|
}
|
|
|
|
~FileChecksumTestHelper() {}
|
|
|
|
|
|
|
|
void CreateWriteableFile() {
|
|
|
|
sink_ = new test::StringSink();
|
|
|
|
std::unique_ptr<FSWritableFile> holder(sink_);
|
|
|
|
file_writer_.reset(new WritableFileWriter(
|
|
|
|
std::move(holder), "" /* don't care */, FileOptions()));
|
|
|
|
}
|
|
|
|
|
|
|
|
void SetFileChecksumGenerator(FileChecksumGenerator* checksum_generator) {
|
|
|
|
if (file_writer_ != nullptr) {
|
|
|
|
file_writer_->TEST_SetFileChecksumGenerator(checksum_generator);
|
|
|
|
} else {
|
|
|
|
delete checksum_generator;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
WritableFileWriter* GetFileWriter() { return file_writer_.get(); }
|
|
|
|
|
|
|
|
Status ResetTableBuilder(std::unique_ptr<TableBuilder>&& builder) {
|
|
|
|
assert(builder != nullptr);
|
|
|
|
table_builder_ = std::move(builder);
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
void AddKVtoKVMap(int num_entries) {
|
|
|
|
Random rnd(test::RandomSeed());
|
|
|
|
for (int i = 0; i < num_entries; i++) {
|
|
|
|
std::string v = rnd.RandomString(100);
|
|
|
|
kv_map_[test::RandomKey(&rnd, 20)] = v;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Status WriteKVAndFlushTable() {
|
|
|
|
for (const auto& kv : kv_map_) {
|
|
|
|
if (convert_to_internal_key_) {
|
|
|
|
ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue);
|
|
|
|
std::string encoded;
|
|
|
|
AppendInternalKey(&encoded, ikey);
|
|
|
|
table_builder_->Add(encoded, kv.second);
|
|
|
|
} else {
|
|
|
|
table_builder_->Add(kv.first, kv.second);
|
|
|
|
}
|
|
|
|
EXPECT_TRUE(table_builder_->status().ok());
|
|
|
|
}
|
|
|
|
Status s = table_builder_->Finish();
|
|
|
|
EXPECT_OK(file_writer_->Flush());
|
|
|
|
EXPECT_OK(s);
|
|
|
|
|
|
|
|
EXPECT_EQ(sink_->contents().size(), table_builder_->FileSize());
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string GetFileChecksum() {
|
|
|
|
EXPECT_OK(file_writer_->Close());
|
|
|
|
return table_builder_->GetFileChecksum();
|
|
|
|
}
|
|
|
|
|
|
|
|
const char* GetFileChecksumFuncName() {
|
|
|
|
return table_builder_->GetFileChecksumFuncName();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status CalculateFileChecksum(FileChecksumGenerator* file_checksum_generator,
|
|
|
|
std::string* checksum) {
|
|
|
|
assert(file_checksum_generator != nullptr);
|
|
|
|
cur_uniq_id_ = checksum_uniq_id_++;
|
|
|
|
test::StringSink* ss_rw =
|
|
|
|
static_cast<test::StringSink*>(file_writer_->writable_file());
|
|
|
|
std::unique_ptr<FSRandomAccessFile> source(
|
|
|
|
new test::StringSource(ss_rw->contents()));
|
|
|
|
file_reader_.reset(new RandomAccessFileReader(std::move(source), "test"));
|
|
|
|
|
|
|
|
std::unique_ptr<char[]> scratch(new char[2048]);
|
|
|
|
Slice result;
|
|
|
|
uint64_t offset = 0;
|
|
|
|
Status s;
|
|
|
|
s = file_reader_->Read(IOOptions(), offset, 2048, &result, scratch.get(),
|
|
|
|
nullptr, false);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
while (result.size() != 0) {
|
|
|
|
file_checksum_generator->Update(scratch.get(), result.size());
|
|
|
|
offset += static_cast<uint64_t>(result.size());
|
|
|
|
s = file_reader_->Read(IOOptions(), offset, 2048, &result, scratch.get(),
|
|
|
|
nullptr, false);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPECT_EQ(offset, static_cast<uint64_t>(table_builder_->FileSize()));
|
|
|
|
file_checksum_generator->Finalize();
|
|
|
|
*checksum = file_checksum_generator->GetChecksum();
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
bool convert_to_internal_key_;
|
|
|
|
uint64_t cur_uniq_id_;
|
|
|
|
std::unique_ptr<WritableFileWriter> file_writer_;
|
|
|
|
std::unique_ptr<RandomAccessFileReader> file_reader_;
|
|
|
|
std::unique_ptr<TableBuilder> table_builder_;
|
|
|
|
stl_wrappers::KVMap kv_map_;
|
|
|
|
test::StringSink* sink_ = nullptr;
|
|
|
|
|
|
|
|
static uint64_t checksum_uniq_id_;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Out-of-class definition of the static counter declared in
// FileChecksumTestHelper; it starts at 1.
uint64_t FileChecksumTestHelper::checksum_uniq_id_ = 1;
|
|
|
|
|
|
|
|
// Run every BlockBasedTableTest case once per entry of
// test::kFooterFormatVersionsToTest.
INSTANTIATE_TEST_CASE_P(
    FormatVersions, BlockBasedTableTest,
    ::testing::ValuesIn(test::kFooterFormatVersionsToTest));
|
|
|
|
|
|
|
|
// This test serves as the living tutorial for the prefix scan of user collected
|
|
|
|
// properties.
|
|
|
|
TEST_F(TablePropertyTest, PrefixScanTest) {
  UserCollectedProperties props{
      {"num.111.1", "1"}, {"num.111.2", "2"}, {"num.111.3", "3"},
      {"num.333.1", "1"}, {"num.333.2", "2"}, {"num.333.3", "3"},
      {"num.555.1", "1"}, {"num.555.2", "2"}, {"num.555.3", "3"},
  };

  // Prefixes that are present: a lower_bound() followed by a forward walk
  // must visit exactly the three entries sharing the prefix, in order.
  for (const std::string prefix : {"num.111", "num.333", "num.555"}) {
    int num = 0;
    auto pos = props.lower_bound(prefix);
    while (pos != props.end() &&
           pos->first.compare(0, prefix.size(), prefix) == 0) {
      ++num;
      auto key = prefix + "." + ToString(num);
      ASSERT_EQ(key, pos->first);
      ASSERT_EQ(ToString(num), pos->second);
      ++pos;
    }
    ASSERT_EQ(3, num);
  }

  // Prefixes that are absent: lower_bound() must land either at the end or
  // on an entry that does not start with the prefix.
  for (const std::string prefix :
       {"num.000", "num.222", "num.444", "num.666"}) {
    auto pos = props.lower_bound(prefix);
    const bool at_end = (pos == props.end());
    ASSERT_TRUE(at_end ||
                pos->first.compare(0, prefix.size(), prefix) != 0);
  }
}
|
|
|
|
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
3 years ago
|
|
|
namespace {
|
|
|
|
struct TestIds {
|
|
|
|
UniqueId64x3 internal_id;
|
|
|
|
UniqueId64x3 external_id;
|
|
|
|
};
|
|
|
|
|
|
|
|
inline bool operator==(const TestIds& lhs, const TestIds& rhs) {
|
|
|
|
return lhs.internal_id == rhs.internal_id &&
|
|
|
|
lhs.external_id == rhs.external_id;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::ostream& operator<<(std::ostream& os, const TestIds& ids) {
|
|
|
|
return os << std::hex << "{{{ 0x" << ids.internal_id[0] << "U, 0x"
|
|
|
|
<< ids.internal_id[1] << "U, 0x" << ids.internal_id[2]
|
|
|
|
<< "U }}, {{ 0x" << ids.external_id[0] << "U, 0x"
|
|
|
|
<< ids.external_id[1] << "U, 0x" << ids.external_id[2] << "U }}}";
|
|
|
|
}
|
|
|
|
|
|
|
|
TestIds GetUniqueId(TableProperties* tp, std::unordered_set<uint64_t>* seen,
|
|
|
|
const std::string& db_id, const std::string& db_session_id,
|
|
|
|
uint64_t file_number) {
|
|
|
|
// First test session id logic
|
|
|
|
if (db_session_id.size() == 20) {
|
|
|
|
uint64_t upper;
|
|
|
|
uint64_t lower;
|
|
|
|
EXPECT_OK(DecodeSessionId(db_session_id, &upper, &lower));
|
|
|
|
EXPECT_EQ(EncodeSessionId(upper, lower), db_session_id);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Get external using public API
|
|
|
|
tp->db_id = db_id;
|
|
|
|
tp->db_session_id = db_session_id;
|
|
|
|
tp->orig_file_number = file_number;
|
|
|
|
TestIds t;
|
|
|
|
{
|
|
|
|
std::string uid;
|
|
|
|
EXPECT_OK(GetUniqueIdFromTableProperties(*tp, &uid));
|
|
|
|
EXPECT_EQ(uid.size(), 24U);
|
|
|
|
t.external_id[0] = DecodeFixed64(&uid[0]);
|
|
|
|
t.external_id[1] = DecodeFixed64(&uid[8]);
|
|
|
|
t.external_id[2] = DecodeFixed64(&uid[16]);
|
|
|
|
}
|
|
|
|
// All these should be effectively random
|
|
|
|
EXPECT_TRUE(seen->insert(t.external_id[0]).second);
|
|
|
|
EXPECT_TRUE(seen->insert(t.external_id[1]).second);
|
|
|
|
EXPECT_TRUE(seen->insert(t.external_id[2]).second);
|
|
|
|
|
|
|
|
// Get internal with internal API
|
|
|
|
EXPECT_OK(GetSstInternalUniqueId(db_id, db_session_id, file_number,
|
|
|
|
&t.internal_id));
|
|
|
|
|
|
|
|
// Verify relationship
|
|
|
|
UniqueId64x3 tmp = t.internal_id;
|
|
|
|
InternalUniqueIdToExternal(&tmp);
|
|
|
|
EXPECT_EQ(tmp, t.external_id);
|
|
|
|
ExternalUniqueIdToInternal(&tmp);
|
|
|
|
EXPECT_EQ(tmp, t.internal_id);
|
|
|
|
return t;
|
|
|
|
}
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
TEST_F(TablePropertyTest, UniqueIdsSchemaAndQuality) {
  // Fill every property with random data first, so that the IDs can only
  // depend on the fields GetUniqueId() sets explicitly.
  TableProperties tp;
  TEST_SetRandomTableProperties(&tp);

  // DB id is normally RFC-4122
  const std::string db_id1 = "7265b6eb-4e42-4aec-86a4-0dc5e73a228d";
  // Allow other forms of DB id
  const std::string db_id2 = "1728000184588763620";
  const std::string db_id3 = "x";

  // DB session id is normally 20 chars in base-36, but 13 to 24 chars
  // is ok, roughly 64 to 128 bits.
  const std::string ses_id1 = "ABCDEFGHIJ0123456789";
  // Same trailing 13 digits
  const std::string ses_id2 = "HIJ0123456789";
  const std::string ses_id3 = "0123ABCDEFGHIJ0123456789";
  // Different trailing 12 digits
  const std::string ses_id4 = "ABCDEFGH888888888888";
  // And change length
  const std::string ses_id5 = "ABCDEFGHIJ012";
  const std::string ses_id6 = "ABCDEFGHIJ0123456789ABCD";

  using T = TestIds;
  std::unordered_set<uint64_t> seen;
  // Establish a stable schema for the unique IDs. These values must not
  // change for existing table files.
  // (Note: parens needed for macro parsing, extra braces needed for some
  // compilers.)
  EXPECT_EQ(
      GetUniqueId(&tp, &seen, db_id1, ses_id1, 1),
      T({{{0x61d7dcf415d9cf19U, 0x160d77aae90757fdU, 0x907f41dfd90724ffU}},
         {{0xf0bd230365df7464U, 0xca089303f3648eb4U, 0x4b44f7e7324b2817U}}}));
  // Only change internal_id[1] with file number
  EXPECT_EQ(
      GetUniqueId(&tp, &seen, db_id1, ses_id1, 2),
      T({{{0x61d7dcf415d9cf19U, 0x160d77aae90757feU, 0x907f41dfd90724ffU}},
         {{0xf13fdf7adcfebb6dU, 0x97cd2226cc033ea2U, 0x198c438182091f0eU}}}));
  EXPECT_EQ(
      GetUniqueId(&tp, &seen, db_id1, ses_id1, 123456789),
      T({{{0x61d7dcf415d9cf19U, 0x160d77aaee5c9ae9U, 0x907f41dfd90724ffU}},
         {{0x81fbcebe1ac6c4f0U, 0x6b14a64cfdc0f1c4U, 0x7d8fb6eaf18edbb3U}}}));
  // Change internal_id[1] and internal_id[2] with db_id
  EXPECT_EQ(
      GetUniqueId(&tp, &seen, db_id2, ses_id1, 1),
      T({{{0x61d7dcf415d9cf19U, 0xf89c471f572f0d25U, 0x1f0f2a5eb0e6257eU}},
         {{0x7f1d01d453616991U, 0x32ddf2afec804ab2U, 0xd10a1ee2f0c7d9c1U}}}));
  EXPECT_EQ(
      GetUniqueId(&tp, &seen, db_id3, ses_id1, 1),
      T({{{0x61d7dcf415d9cf19U, 0xfed297a8154a57d0U, 0x8b931b9cdebd9e8U}},
         {{0x62b2f43183f6894bU, 0x897ff2b460eefad1U, 0xf4ec189fb2d15e04U}}}));
  // Keeping same last 13 digits of ses_id keeps same internal_id[0]
  EXPECT_EQ(
      GetUniqueId(&tp, &seen, db_id1, ses_id2, 1),
      T({{{0x61d7dcf415d9cf19U, 0x5f6cc4fa2d528c8U, 0x7b70845d5bfb5446U}},
         {{0x96d1c83ffcc94266U, 0x82663eac0ec6e14aU, 0x94a88b49678b77f6U}}}));
  EXPECT_EQ(
      GetUniqueId(&tp, &seen, db_id1, ses_id3, 1),
      T({{{0x61d7dcf415d9cf19U, 0xfc7232879db37ea2U, 0xc0378d74ea4c89cdU}},
         {{0xdf2ef57e98776905U, 0xda5b31c987da833bU, 0x79c1b4bd0a9e760dU}}}));
  // Changing last 12 digits of ses_id only changes internal_id[0]
  // (vs. db_id1, ses_id1, 1)
  EXPECT_EQ(
      GetUniqueId(&tp, &seen, db_id1, ses_id4, 1),
      T({{{0x4f07cc0d003a83a8U, 0x160d77aae90757fdU, 0x907f41dfd90724ffU}},
         {{0xbcf85336a9f71f04U, 0x4f2949e2f3adb60dU, 0x9ca0def976abfa10U}}}));
  // ses_id can change everything.
  EXPECT_EQ(
      GetUniqueId(&tp, &seen, db_id1, ses_id5, 1),
      T({{{0x94b8768e43f87ce6U, 0xc2559653ac4e7c93U, 0xde6dff6bbb1223U}},
         {{0x5a9537af681817fbU, 0x1afcd1fecaead5eaU, 0x767077ad9ebe0008U}}}));
  EXPECT_EQ(
      GetUniqueId(&tp, &seen, db_id1, ses_id6, 1),
      T({{{0x43cfb0ffa3b710edU, 0x263c580426406a1bU, 0xfacc91379a80d29dU}},
         {{0xfa90547d84cb1cdbU, 0x2afe99c641992d4aU, 0x205b7f7b60e51cc2U}}}));

  // Now verify more thoroughly that any small change in inputs completely
  // changes external unique id.
  // (Relying on 'seen' checks etc. in GetUniqueId)
  std::string db_id = "00000000-0000-0000-0000-000000000000";
  std::string ses_id = "000000000000000000000000";
  uint64_t file_num = 1;
  // Perturb db_id one character at a time (dashes are left alone), putting
  // the '0' back before moving on to the next position.
  for (char& id_char : db_id) {
    if (id_char != '-') {
      for (char alt : std::string("123456789abcdef")) {
        id_char = alt;
        GetUniqueId(&tp, &seen, db_id, ses_id, file_num);
      }
      id_char = '0';
    }
  }
  // Perturb ses_id the same way, over the base-36 alphabet.
  for (char& ses_char : ses_id) {
    for (char alt : std::string("123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ")) {
      ses_char = alt;
      GetUniqueId(&tp, &seen, db_id, ses_id, file_num);
    }
    ses_char = '0';
  }
  // Perturb the file number: every single-bit value above bit 0.
  for (int shift = 1; shift < 64; ++shift) {
    GetUniqueId(&tp, &seen, db_id, ses_id, file_num << shift);
  }

  // Verify that "all zeros" in first 128 bits is equivalent for internal and
  // external IDs. This way, as long as we avoid "all zeros" in internal IDs,
  // we avoid it in external IDs.
  {
    UniqueId64x3 id1{{0, 0, Random::GetTLSInstance()->Next64()}};
    UniqueId64x3 id2 = id1;
    InternalUniqueIdToExternal(&id1);
    EXPECT_EQ(id1, id2);
    ExternalUniqueIdToInternal(&id2);
    EXPECT_EQ(id1, id2);
  }
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
void SetGoodTableProperties(TableProperties* tp) {
|
|
|
|
// To ensure the computation only depends on the expected entries, we set
|
|
|
|
// the rest randomly
|
|
|
|
TEST_SetRandomTableProperties(tp);
|
|
|
|
tp->db_id = "7265b6eb-4e42-4aec-86a4-0dc5e73a228d";
|
|
|
|
tp->db_session_id = "ABCDEFGHIJ0123456789";
|
|
|
|
tp->orig_file_number = 1;
|
|
|
|
}
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
TEST_F(TablePropertyTest, UniqueIdHumanStrings) {
  TableProperties tp;
  SetGoodTableProperties(&tp);

  // The raw 24-byte ID for the "good" properties, and its human form.
  std::string uid;
  EXPECT_OK(GetUniqueIdFromTableProperties(tp, &uid));
  EXPECT_EQ(uid,
            (std::string{{'\x64', '\x74', '\xdf', '\x65', '\x03', '\x23',
                          '\xbd', '\xf0', '\xb4', '\x8e', '\x64', '\xf3',
                          '\x03', '\x93', '\x08', '\xca', '\x17', '\x28',
                          '\x4b', '\x32', '\xe7', '\xf7', '\x44', '\x4b'}}));
  EXPECT_EQ(UniqueIdToHumanString(uid),
            "6474DF650323BDF0-B48E64F3039308CA-17284B32E7F7444B");

  // including zero padding
  uid = std::string(24U, '\0');
  uid[15] = '\x12';
  uid[23] = '\xAB';
  EXPECT_EQ(UniqueIdToHumanString(uid),
            "0000000000000000-0000000000000012-00000000000000AB");

  // And shortened
  uid = std::string(20U, '\0');
  uid[5] = '\x12';
  uid[10] = '\xAB';
  uid[17] = '\xEF';
  EXPECT_EQ(UniqueIdToHumanString(uid),
            "0000000000120000-0000AB0000000000-00EF0000");

  // Progressively truncate the same buffer and check each rendering.
  uid.resize(16);
  EXPECT_EQ(UniqueIdToHumanString(uid), "0000000000120000-0000AB0000000000");

  uid.resize(11);
  EXPECT_EQ(UniqueIdToHumanString(uid), "0000000000120000-0000AB");

  uid.resize(6);
  EXPECT_EQ(UniqueIdToHumanString(uid), "000000000012");
}
|
|
|
|
|
|
|
|
// Each case starts from known-good properties, removes exactly one of the
// three required inputs, and expects a NotSupported status.
TEST_F(TablePropertyTest, UniqueIdsFailure) {
  TableProperties tp;
  std::string id;

  // No DB id.
  SetGoodTableProperties(&tp);
  tp.db_id.clear();
  EXPECT_TRUE(GetUniqueIdFromTableProperties(tp, &id).IsNotSupported());

  // No DB session id.
  SetGoodTableProperties(&tp);
  tp.db_session_id.clear();
  EXPECT_TRUE(GetUniqueIdFromTableProperties(tp, &id).IsNotSupported());

  // No original file number.
  SetGoodTableProperties(&tp);
  tp.orig_file_number = 0;
  EXPECT_TRUE(GetUniqueIdFromTableProperties(tp, &id).IsNotSupported());
}
|
|
|
|
|
|
|
|
// This test includes all the basic checks except those for index size and
// block size, which are covered by separate unit tests.
|
|
|
|
TEST_P(BlockBasedTableTest, BasicBlockBasedTableProperties) {
  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);

  // Nine entries, each with a 2-byte user key and a 4-byte value.
  const char* const kEntries[][2] = {
      {"a1", "val1"}, {"b2", "val2"}, {"c3", "val3"},
      {"d4", "val4"}, {"e5", "val5"}, {"f6", "val6"},
      {"g7", "val7"}, {"h8", "val8"}, {"j9", "val9"},
  };
  for (const auto& entry : kEntries) {
    c.Add(entry[0], entry[1]);
  }
  // 8 is seq size, 9 k-v totally
  const uint64_t diff_internal_user_bytes = 9 * 8;

  // Build an uncompressed table with a restart point at every entry and
  // statistics collection at the highest level.
  Options options;
  options.compression = kNoCompression;
  options.statistics = CreateDBStatistics();
  options.statistics->set_stats_level(StatsLevel::kAll);
  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
  table_options.block_restart_interval = 1;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));

  std::vector<std::string> keys;
  stl_wrappers::KVMap kvmap;
  ImmutableOptions ioptions(options);
  MutableCFOptions moptions(options);
  c.Finish(options, ioptions, moptions, table_options,
           GetPlainInternalComparator(options.comparator), &keys, &kvmap);
  ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_NOT_COMPRESSED), 0);

  const TableProperties& props = *c.GetTableReader()->GetTableProperties();
  ASSERT_EQ(kvmap.size(), props.num_entries);

  // Expected user-key and value byte totals; the recorded key size also
  // counts the per-entry internal-key overhead.
  const auto expected_user_key_bytes = kvmap.size() * 2ul;
  const auto expected_value_bytes = kvmap.size() * 4ul;
  ASSERT_EQ(expected_user_key_bytes + diff_internal_user_bytes,
            props.raw_key_size);
  ASSERT_EQ(expected_value_bytes, props.raw_value_size);
  ASSERT_EQ(1ul, props.num_data_blocks);
  ASSERT_EQ("", props.filter_policy_name);  // no filter policy is used

  // Verify data size: rebuild the single data block from the same entries
  // and compare its size (plus trailer and internal-key overhead) against
  // the recorded property.
  BlockBuilder reference_block(1);
  for (const auto& kv : kvmap) {
    reference_block.Add(kv.first, kv.second);
  }
  const Slice reference_content = reference_block.Finish();
  ASSERT_EQ(reference_content.size() + BlockBasedTable::kBlockTrailerSize +
                diff_internal_user_bytes,
            props.data_size);
  c.ResetTableReader();
}
|
|
|
|
|
|
|
|
#ifdef SNAPPY
|
|
|
|
uint64_t BlockBasedTableTest::IndexUncompressedHelper(bool compressed) {
|
|
|
|
TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
|
|
|
|
constexpr size_t kNumKeys = 10000;
|
|
|
|
|
|
|
|
for (size_t k = 0; k < kNumKeys; ++k) {
|
|
|
|
c.Add("key" + ToString(k), "val" + ToString(k));
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
Options options;
|
|
|
|
options.compression = kSnappyCompression;
|
|
|
|
options.statistics = CreateDBStatistics();
|
|
|
|
options.statistics->set_stats_level(StatsLevel::kAll);
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
table_options.block_restart_interval = 1;
|
|
|
|
table_options.enable_index_compression = compressed;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
ImmutableOptions ioptions(options);
|
|
|
|
MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
|
|
|
c.ResetTableReader();
|
|
|
|
return options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
|
|
|
|
}
|
|
|
|
// Enabling index compression should add exactly one compressed block (the
// index block) compared with the same table built without it.
TEST_P(BlockBasedTableTest, IndexUncompressed) {
  const uint64_t with_index_compression = IndexUncompressedHelper(true);
  const uint64_t without_index_compression = IndexUncompressedHelper(false);
  EXPECT_EQ(without_index_compression + 1, with_index_compression);
}
|
|
|
|
#endif // SNAPPY
|
|
|
|
|
|
|
|
// Checks the name-valued table properties (comparator, merge operator,
// prefix extractor, property collectors, filter policy, compression), first
// with default options and then with each of them explicitly configured.
TEST_P(BlockBasedTableTest, BlockBasedTableProperties2) {
  TableConstructor c(&reverse_key_comparator);
  std::vector<std::string> keys;
  stl_wrappers::KVMap kvmap;

  // Case 1: defaults, with compression explicitly disabled.
  {
    Options options;
    options.compression = CompressionType::kNoCompression;
    BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
    options.table_factory.reset(NewBlockBasedTableFactory(table_options));

    const ImmutableOptions ioptions(options);
    const MutableCFOptions moptions(options);
    c.Finish(options, ioptions, moptions, table_options,
             GetPlainInternalComparator(options.comparator), &keys, &kvmap);

    const TableProperties& props = *c.GetTableReader()->GetTableProperties();

    // Default comparator
    ASSERT_EQ("leveldb.BytewiseComparator", props.comparator_name);
    // No merge operator
    ASSERT_EQ("nullptr", props.merge_operator_name);
    // No prefix extractor
    ASSERT_EQ("nullptr", props.prefix_extractor_name);
    // No property collectors
    ASSERT_EQ("[]", props.property_collectors_names);
    // No filter policy is used
    ASSERT_EQ("", props.filter_policy_name);
    // Compression type == that set:
    ASSERT_EQ("NoCompression", props.compression_name);
    c.ResetTableReader();
  }

  // Case 2: comparator, merge operator, prefix extractor and two property
  // collector factories are all set explicitly.
  {
    Options options;
    BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
    options.comparator = &reverse_key_comparator;
    options.merge_operator = MergeOperators::CreateUInt64AddOperator();
    options.prefix_extractor.reset(NewNoopTransform());
    options.table_properties_collector_factories.emplace_back(
        new DummyPropertiesCollectorFactory1());
    options.table_properties_collector_factories.emplace_back(
        new DummyPropertiesCollectorFactory2());

    const ImmutableOptions ioptions(options);
    const MutableCFOptions moptions(options);
    c.Finish(options, ioptions, moptions, table_options,
             GetPlainInternalComparator(options.comparator), &keys, &kvmap);

    const TableProperties& props = *c.GetTableReader()->GetTableProperties();

    ASSERT_EQ("rocksdb.ReverseBytewiseComparator", props.comparator_name);
    ASSERT_EQ("UInt64AddOperator", props.merge_operator_name);
    ASSERT_EQ("rocksdb.Noop", props.prefix_extractor_name);
    ASSERT_EQ(
        "[DummyPropertiesCollectorFactory1,DummyPropertiesCollectorFactory2]",
        props.property_collectors_names);
    ASSERT_EQ("", props.filter_policy_name);  // no filter policy is used
    c.ResetTableReader();
  }
}
|
|
|
|
|
|
|
|
TEST_P(BlockBasedTableTest, RangeDelBlock) {
|
|
|
|
TableConstructor c(BytewiseComparator());
|
|
|
|
std::vector<std::string> keys = {"1pika", "2chu"};
|
|
|
|
std::vector<std::string> vals = {"p", "c"};
|
|
|
|
|
Cache fragmented range tombstones in BlockBasedTableReader (#4493)
Summary:
This allows tombstone fragmenting to only be performed when the table is opened, and cached for subsequent accesses.
On the same DB used in #4449, running `readrandom` results in the following:
```
readrandom : 0.983 micros/op 1017076 ops/sec; 78.3 MB/s (63103 of 100000 found)
```
Now that Get performance in the presence of range tombstones is reasonable, I also compared the performance between a DB with range tombstones, "expanded" range tombstones (several point tombstones that cover the same keys the equivalent range tombstone would cover, a common workaround for DeleteRange), and no range tombstones. The created DBs had 5 million keys each, and DeleteRange was called at regular intervals (depending on the total number of range tombstones being written) after 4.5 million Puts. The table below summarizes the results of a `readwhilewriting` benchmark (in order to provide somewhat more realistic results):
```
Tombstones? | avg micros/op | stddev micros/op | avg ops/s | stddev ops/s
----------------- | ------------- | ---------------- | ------------ | ------------
None | 0.6186 | 0.04637 | 1,625,252.90 | 124,679.41
500 Expanded | 0.6019 | 0.03628 | 1,666,670.40 | 101,142.65
500 Unexpanded | 0.6435 | 0.03994 | 1,559,979.40 | 104,090.52
1k Expanded | 0.6034 | 0.04349 | 1,665,128.10 | 125,144.57
1k Unexpanded | 0.6261 | 0.03093 | 1,600,457.50 | 79,024.94
5k Expanded | 0.6163 | 0.05926 | 1,636,668.80 | 154,888.85
5k Unexpanded | 0.6402 | 0.04002 | 1,567,804.70 | 100,965.55
10k Expanded | 0.6036 | 0.05105 | 1,667,237.70 | 142,830.36
10k Unexpanded | 0.6128 | 0.02598 | 1,634,633.40 | 72,161.82
25k Expanded | 0.6198 | 0.04542 | 1,620,980.50 | 116,662.93
25k Unexpanded | 0.5478 | 0.0362 | 1,833,059.10 | 121,233.81
50k Expanded | 0.5104 | 0.04347 | 1,973,107.90 | 184,073.49
50k Unexpanded | 0.4528 | 0.03387 | 2,219,034.50 | 170,984.32
```
After a large enough quantity of range tombstones are written, range tombstone Gets can become faster than reading from an equivalent DB with several point tombstones.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4493
Differential Revision: D10842844
Pulled By: abhimadan
fbshipit-source-id: a7d44534f8120e6aabb65779d26c6b9df954c509
6 years ago
|
|
|
std::vector<RangeTombstone> expected_tombstones = {
|
|
|
|
{"1pika", "2chu", 0},
|
|
|
|
{"2chu", "c", 1},
|
|
|
|
{"2chu", "c", 0},
|
|
|
|
{"c", "p", 0},
|
|
|
|
};
|
|
|
|
|
|
|
|
for (int i = 0; i < 2; i++) {
|
|
|
|
RangeTombstone t(keys[i], vals[i], i);
|
|
|
|
std::pair<InternalKey, Slice> p = t.Serialize();
|
|
|
|
c.Add(p.first.Encode().ToString(), p.second);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<std::string> sorted_keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
Options options;
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
table_options.block_restart_interval = 1;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
const ImmutableOptions ioptions(options);
|
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
std::unique_ptr<InternalKeyComparator> internal_cmp(
|
|
|
|
new InternalKeyComparator(options.comparator));
|
|
|
|
c.Finish(options, ioptions, moptions, table_options, *internal_cmp,
|
|
|
|
&sorted_keys, &kvmap);
|
|
|
|
|
|
|
|
for (int j = 0; j < 2; ++j) {
|
|
|
|
std::unique_ptr<InternalIterator> iter(
|
|
|
|
c.GetTableReader()->NewRangeTombstoneIterator(ReadOptions()));
|
|
|
|
if (j > 0) {
|
|
|
|
// For second iteration, delete the table reader object and verify the
|
|
|
|
// iterator can still access its metablock's range tombstones.
|
|
|
|
c.ResetTableReader();
|
|
|
|
}
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
Cache fragmented range tombstones in BlockBasedTableReader (#4493)
Summary:
This allows tombstone fragmenting to only be performed when the table is opened, and cached for subsequent accesses.
On the same DB used in #4449, running `readrandom` results in the following:
```
readrandom : 0.983 micros/op 1017076 ops/sec; 78.3 MB/s (63103 of 100000 found)
```
Now that Get performance in the presence of range tombstones is reasonable, I also compared the performance between a DB with range tombstones, "expanded" range tombstones (several point tombstones that cover the same keys the equivalent range tombstone would cover, a common workaround for DeleteRange), and no range tombstones. The created DBs had 5 million keys each, and DeleteRange was called at regular intervals (depending on the total number of range tombstones being written) after 4.5 million Puts. The table below summarizes the results of a `readwhilewriting` benchmark (in order to provide somewhat more realistic results):
```
Tombstones? | avg micros/op | stddev micros/op | avg ops/s | stddev ops/s
----------------- | ------------- | ---------------- | ------------ | ------------
None | 0.6186 | 0.04637 | 1,625,252.90 | 124,679.41
500 Expanded | 0.6019 | 0.03628 | 1,666,670.40 | 101,142.65
500 Unexpanded | 0.6435 | 0.03994 | 1,559,979.40 | 104,090.52
1k Expanded | 0.6034 | 0.04349 | 1,665,128.10 | 125,144.57
1k Unexpanded | 0.6261 | 0.03093 | 1,600,457.50 | 79,024.94
5k Expanded | 0.6163 | 0.05926 | 1,636,668.80 | 154,888.85
5k Unexpanded | 0.6402 | 0.04002 | 1,567,804.70 | 100,965.55
10k Expanded | 0.6036 | 0.05105 | 1,667,237.70 | 142,830.36
10k Unexpanded | 0.6128 | 0.02598 | 1,634,633.40 | 72,161.82
25k Expanded | 0.6198 | 0.04542 | 1,620,980.50 | 116,662.93
25k Unexpanded | 0.5478 | 0.0362 | 1,833,059.10 | 121,233.81
50k Expanded | 0.5104 | 0.04347 | 1,973,107.90 | 184,073.49
50k Unexpanded | 0.4528 | 0.03387 | 2,219,034.50 | 170,984.32
```
After a large enough quantity of range tombstones are written, range tombstone Gets can become faster than reading from an equivalent DB with several point tombstones.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4493
Differential Revision: D10842844
Pulled By: abhimadan
fbshipit-source-id: a7d44534f8120e6aabb65779d26c6b9df954c509
6 years ago
|
|
|
for (size_t i = 0; i < expected_tombstones.size(); i++) {
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ParsedInternalKey parsed_key;
|
|
|
|
ASSERT_OK(
|
|
|
|
ParseInternalKey(iter->key(), &parsed_key, true /* log_err_key */));
|
|
|
|
RangeTombstone t(parsed_key, iter->value());
|
Cache fragmented range tombstones in BlockBasedTableReader (#4493)
Summary:
This allows tombstone fragmenting to only be performed when the table is opened, and cached for subsequent accesses.
On the same DB used in #4449, running `readrandom` results in the following:
```
readrandom : 0.983 micros/op 1017076 ops/sec; 78.3 MB/s (63103 of 100000 found)
```
Now that Get performance in the presence of range tombstones is reasonable, I also compared the performance between a DB with range tombstones, "expanded" range tombstones (several point tombstones that cover the same keys the equivalent range tombstone would cover, a common workaround for DeleteRange), and no range tombstones. The created DBs had 5 million keys each, and DeleteRange was called at regular intervals (depending on the total number of range tombstones being written) after 4.5 million Puts. The table below summarizes the results of a `readwhilewriting` benchmark (in order to provide somewhat more realistic results):
```
Tombstones? | avg micros/op | stddev micros/op | avg ops/s | stddev ops/s
----------------- | ------------- | ---------------- | ------------ | ------------
None | 0.6186 | 0.04637 | 1,625,252.90 | 124,679.41
500 Expanded | 0.6019 | 0.03628 | 1,666,670.40 | 101,142.65
500 Unexpanded | 0.6435 | 0.03994 | 1,559,979.40 | 104,090.52
1k Expanded | 0.6034 | 0.04349 | 1,665,128.10 | 125,144.57
1k Unexpanded | 0.6261 | 0.03093 | 1,600,457.50 | 79,024.94
5k Expanded | 0.6163 | 0.05926 | 1,636,668.80 | 154,888.85
5k Unexpanded | 0.6402 | 0.04002 | 1,567,804.70 | 100,965.55
10k Expanded | 0.6036 | 0.05105 | 1,667,237.70 | 142,830.36
10k Unexpanded | 0.6128 | 0.02598 | 1,634,633.40 | 72,161.82
25k Expanded | 0.6198 | 0.04542 | 1,620,980.50 | 116,662.93
25k Unexpanded | 0.5478 | 0.0362 | 1,833,059.10 | 121,233.81
50k Expanded | 0.5104 | 0.04347 | 1,973,107.90 | 184,073.49
50k Unexpanded | 0.4528 | 0.03387 | 2,219,034.50 | 170,984.32
```
After a large enough quantity of range tombstones are written, range tombstone Gets can become faster than reading from an equivalent DB with several point tombstones.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4493
Differential Revision: D10842844
Pulled By: abhimadan
fbshipit-source-id: a7d44534f8120e6aabb65779d26c6b9df954c509
6 years ago
|
|
|
const auto& expected_t = expected_tombstones[i];
|
|
|
|
ASSERT_EQ(t.start_key_, expected_t.start_key_);
|
|
|
|
ASSERT_EQ(t.end_key_, expected_t.end_key_);
|
|
|
|
ASSERT_EQ(t.seq_, expected_t.seq_);
|
|
|
|
iter->Next();
|
|
|
|
}
|
|
|
|
ASSERT_TRUE(!iter->Valid());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// With a Bloom filter policy configured, the table properties must record
// the built-in Bloom filter's name.
TEST_P(BlockBasedTableTest, FilterPolicyNameProperties) {
  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
  c.Add("a1", "val1");

  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
  table_options.filter_policy.reset(NewBloomFilterPolicy(10));
  Options options;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));

  std::vector<std::string> keys;
  stl_wrappers::KVMap kvmap;
  const ImmutableOptions ioptions(options);
  const MutableCFOptions moptions(options);
  c.Finish(options, ioptions, moptions, table_options,
           GetPlainInternalComparator(options.comparator), &keys, &kvmap);

  const TableProperties& props = *c.GetTableReader()->GetTableProperties();
  ASSERT_EQ("rocksdb.BuiltinBloomFilter", props.filter_policy_name);
  c.ResetTableReader();
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// BlockBasedTableTest::PrefetchTest
|
|
|
|
//
|
|
|
|
// Asserts that every key in `keys_in_cache` is reported as cached by
// `table_reader->TEST_KeyInCache`, and that every key in `keys_not_in_cache`
// is not.
//
// When `convert` is true, each key is first wrapped into an internal key
// (kMaxSequenceNumber, kTypeValue) and its encoded form is probed; otherwise
// the key is probed as given.
//
// On the first violated expectation, a fatal GoogleTest failure naming the
// offending key is raised and the function returns.
void AssertKeysInCache(BlockBasedTable* table_reader,
                       const std::vector<std::string>& keys_in_cache,
                       const std::vector<std::string>& keys_not_in_cache,
                       bool convert = false) {
  // Single probe used for both lists, so the two modes cannot drift apart.
  const auto in_cache = [table_reader, convert](const std::string& key) -> bool {
    if (convert) {
      InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
      return table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode());
    }
    return table_reader->TEST_KeyInCache(ReadOptions(), key);
  };

  // Iterate by const reference: the keys are only read, never modified.
  for (const auto& key : keys_in_cache) {
    ASSERT_TRUE(in_cache(key)) << "key expected in block cache: " << key;
  }
  for (const auto& key : keys_not_in_cache) {
    ASSERT_FALSE(in_cache(key)) << "key not expected in block cache: " << key;
  }
}
|
|
|
|
|
|
|
|
// Reopens the table in `c` with a fresh block cache, prefetches the range
// [key_begin, key_end] (a null bound is passed through to Prefetch as null),
// checks that the returned status code matches `expected_status`, and then
// verifies which keys are and are not present in the cache.
void PrefetchRange(TableConstructor* c, Options* opt,
                   BlockBasedTableOptions* table_options, const char* key_begin,
                   const char* key_end,
                   const std::vector<std::string>& keys_in_cache,
                   const std::vector<std::string>& keys_not_in_cache,
                   const Status expected_status = Status::OK()) {
  // reset the cache and reopen the table
  table_options->block_cache = NewLRUCache(16 * 1024 * 1024, 4);
  opt->table_factory.reset(NewBlockBasedTableFactory(*table_options));
  const ImmutableOptions ioptions2(*opt);
  const MutableCFOptions moptions(*opt);
  ASSERT_OK(c->Reopen(ioptions2, moptions));

  auto* table_reader = dynamic_cast<BlockBasedTable*>(c->GetTableReader());

  // Build the optional range bounds. When the constructor works with
  // internal keys, each bound slice views the matching InternalKey object,
  // which therefore has to stay alive until Prefetch has been called.
  std::unique_ptr<InternalKey> i_begin, i_end;
  std::unique_ptr<Slice> begin, end;
  if (key_begin != nullptr) {
    if (!c->ConvertToInternalKey()) {
      begin.reset(new Slice(key_begin));
    } else {
      i_begin.reset(new InternalKey(key_begin, kMaxSequenceNumber, kTypeValue));
      begin.reset(new Slice(i_begin->Encode()));
    }
  }
  if (key_end != nullptr) {
    if (!c->ConvertToInternalKey()) {
      end.reset(new Slice(key_end));
    } else {
      i_end.reset(new InternalKey(key_end, kMaxSequenceNumber, kTypeValue));
      end.reset(new Slice(i_end->Encode()));
    }
  }

  // prefetch
  Status s;
  s = table_reader->Prefetch(begin.get(), end.get());

  ASSERT_TRUE(s.code() == expected_status.code());

  // assert our expectation in cache warmup
  AssertKeysInCache(table_reader, keys_in_cache, keys_not_in_cache,
                    c->ConvertToInternalKey());
  c->ResetTableReader();
}
|
|
|
|
|
|
|
|
TEST_P(BlockBasedTableTest, PrefetchTest) {
|
|
|
|
// The purpose of this test is to test the prefetching operation built into
|
|
|
|
// BlockBasedTable.
|
|
|
|
Options opt;
|
|
|
|
std::unique_ptr<InternalKeyComparator> ikc;
|
|
|
|
ikc.reset(new test::PlainInternalKeyComparator(opt.comparator));
|
|
|
|
opt.compression = kNoCompression;
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
table_options.block_size = 1024;
|
|
|
|
// big enough so we don't ever lose cached values.
|
|
|
|
table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4);
|
|
|
|
opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
|
|
|
|
c.Add("k01", "hello");
|
|
|
|
c.Add("k02", "hello2");
|
|
|
|
c.Add("k03", std::string(10000, 'x'));
|
|
|
|
c.Add("k04", std::string(200000, 'x'));
|
|
|
|
c.Add("k05", std::string(300000, 'x'));
|
|
|
|
c.Add("k06", "hello3");
|
|
|
|
c.Add("k07", std::string(100000, 'x'));
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
const ImmutableOptions ioptions(opt);
|
|
|
|
const MutableCFOptions moptions(opt);
|
|
|
|
c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap);
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
9 years ago
|
|
|
c.ResetTableReader();
|
|
|
|
|
|
|
|
// We get the following data spread :
|
|
|
|
//
|
|
|
|
// Data block Index
|
|
|
|
// ========================
|
|
|
|
// [ k01 k02 k03 ] k03
|
|
|
|
// [ k04 ] k04
|
|
|
|
// [ k05 ] k05
|
|
|
|
// [ k06 k07 ] k07
|
|
|
|
|
|
|
|
|
|
|
|
// Simple
|
|
|
|
PrefetchRange(&c, &opt, &table_options,
|
|
|
|
/*key_range=*/"k01", "k05",
|
|
|
|
/*keys_in_cache=*/{"k01", "k02", "k03", "k04", "k05"},
|
|
|
|
/*keys_not_in_cache=*/{"k06", "k07"});
|
|
|
|
PrefetchRange(&c, &opt, &table_options, "k01", "k01", {"k01", "k02", "k03"},
|
|
|
|
{"k04", "k05", "k06", "k07"});
|
|
|
|
// odd
|
|
|
|
PrefetchRange(&c, &opt, &table_options, "a", "z",
|
|
|
|
{"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {});
|
|
|
|
PrefetchRange(&c, &opt, &table_options, "k00", "k00", {"k01", "k02", "k03"},
|
|
|
|
{"k04", "k05", "k06", "k07"});
|
|
|
|
// Edge cases
|
|
|
|
PrefetchRange(&c, &opt, &table_options, "k00", "k06",
|
|
|
|
{"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {});
|
|
|
|
PrefetchRange(&c, &opt, &table_options, "k00", "zzz",
|
|
|
|
{"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {});
|
|
|
|
// null keys
|
|
|
|
PrefetchRange(&c, &opt, &table_options, nullptr, nullptr,
|
|
|
|
{"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {});
|
|
|
|
PrefetchRange(&c, &opt, &table_options, "k04", nullptr,
|
|
|
|
{"k04", "k05", "k06", "k07"}, {"k01", "k02", "k03"});
|
|
|
|
PrefetchRange(&c, &opt, &table_options, nullptr, "k05",
|
|
|
|
{"k01", "k02", "k03", "k04", "k05"}, {"k06", "k07"});
|
|
|
|
// invalid
|
|
|
|
PrefetchRange(&c, &opt, &table_options, "k06", "k00", {}, {},
|
|
|
|
Status::InvalidArgument(Slice("k06 "), Slice("k07")));
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when an L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, i.e., the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
9 years ago
|
|
|
c.ResetTableReader();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) {
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
for (int i = 0; i <= 5; ++i) {
|
|
|
|
Options options;
|
|
|
|
// Make each key/value an individual block
|
|
|
|
table_options.block_size = 64;
|
|
|
|
switch (i) {
|
|
|
|
case 0:
|
|
|
|
// Binary search index
|
|
|
|
table_options.index_type = BlockBasedTableOptions::kBinarySearch;
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
break;
|
|
|
|
case 1:
|
|
|
|
// Hash search index
|
|
|
|
table_options.index_type = BlockBasedTableOptions::kHashSearch;
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
options.prefix_extractor.reset(NewFixedPrefixTransform(4));
|
|
|
|
break;
|
|
|
|
case 2:
|
|
|
|
// Hash search index with hash_index_allow_collision
|
|
|
|
table_options.index_type = BlockBasedTableOptions::kHashSearch;
|
|
|
|
table_options.hash_index_allow_collision = true;
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
options.prefix_extractor.reset(NewFixedPrefixTransform(4));
|
|
|
|
break;
|
|
|
|
case 3:
|
|
|
|
// Hash search index with filter policy
|
|
|
|
table_options.index_type = BlockBasedTableOptions::kHashSearch;
|
|
|
|
table_options.filter_policy.reset(NewBloomFilterPolicy(10));
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
options.prefix_extractor.reset(NewFixedPrefixTransform(4));
|
|
|
|
break;
|
|
|
|
case 4:
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
// Two-level index
|
|
|
|
table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
break;
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
case 5:
|
|
|
|
// Binary search with first key
|
|
|
|
table_options.index_type =
|
|
|
|
BlockBasedTableOptions::kBinarySearchWithFirstKey;
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
TableConstructor c(BytewiseComparator(),
|
|
|
|
true /* convert_to_internal_key_ */);
|
|
|
|
c.Add("aaaa1", std::string('a', 56));
|
|
|
|
c.Add("bbaa1", std::string('a', 56));
|
|
|
|
c.Add("cccc1", std::string('a', 56));
|
|
|
|
c.Add("bbbb1", std::string('a', 56));
|
|
|
|
c.Add("baaa1", std::string('a', 56));
|
|
|
|
c.Add("abbb1", std::string('a', 56));
|
|
|
|
c.Add("cccc2", std::string('a', 56));
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
const ImmutableOptions ioptions(options);
|
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
|
|
|
auto props = c.GetTableReader()->GetTableProperties();
|
|
|
|
ASSERT_EQ(7u, props->num_data_blocks);
|
|
|
|
auto* reader = c.GetTableReader();
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.total_order_seek = true;
|
|
|
|
std::unique_ptr<InternalIterator> iter(reader->NewIterator(
|
|
|
|
ro, moptions.prefix_extractor.get(), /*arena=*/nullptr,
|
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized));
|
|
|
|
|
|
|
|
iter->Seek(InternalKey("b", 0, kTypeValue).Encode());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("baaa1", ExtractUserKey(iter->key()).ToString());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("bbaa1", ExtractUserKey(iter->key()).ToString());
|
|
|
|
|
|
|
|
iter->Seek(InternalKey("bb", 0, kTypeValue).Encode());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("bbaa1", ExtractUserKey(iter->key()).ToString());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("bbbb1", ExtractUserKey(iter->key()).ToString());
|
|
|
|
|
|
|
|
iter->Seek(InternalKey("bbb", 0, kTypeValue).Encode());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("bbbb1", ExtractUserKey(iter->key()).ToString());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("cccc1", ExtractUserKey(iter->key()).ToString());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Builds a one-entry table with a bloom filter and a no-op prefix extractor,
// then seeks to that entry with total_order_seek both on and off. The seek
// must succeed and land on the stored key in either mode.
TEST_P(BlockBasedTableTest, NoopTransformSeek) {
  BlockBasedTableOptions bbto = GetBlockBasedTableOptions();
  bbto.filter_policy.reset(NewBloomFilterPolicy(10));

  Options opts;
  opts.comparator = BytewiseComparator();
  opts.table_factory.reset(new BlockBasedTableFactory(bbto));
  opts.prefix_extractor.reset(NewNoopTransform());

  TableConstructor constructor(opts.comparator);
  // The user key is deliberately a single byte: the PrefixMayMatch bug this
  // test targets only shows up when the index key exactly matches the
  // user key.
  InternalKey only_key("a", 1, kTypeValue);
  constructor.Add(only_key.Encode().ToString(), "b");
  std::vector<std::string> sorted_keys;
  stl_wrappers::KVMap contents;
  const ImmutableOptions immutable_opts(opts);
  const MutableCFOptions mutable_opts(opts);
  const InternalKeyComparator icmp(opts.comparator);
  constructor.Finish(opts, immutable_opts, mutable_opts, bbto, icmp,
                     &sorted_keys, &contents);

  auto* table_reader = constructor.GetTableReader();
  // First pass uses total-order seek, second pass uses prefix seek.
  for (const bool use_total_order : {true, false}) {
    ReadOptions read_opts;
    read_opts.total_order_seek = use_total_order;
    std::unique_ptr<InternalIterator> it(table_reader->NewIterator(
        read_opts, mutable_opts.prefix_extractor.get(), /*arena=*/nullptr,
        /*skip_filters=*/false, TableReaderCaller::kUncategorized));

    it->Seek(only_key.Encode());
    ASSERT_OK(it->status());
    ASSERT_TRUE(it->Valid());
    ASSERT_EQ("a", ExtractUserKey(it->key()).ToString());
  }
}
|
|
|
|
|
|
|
|
// Writes a table using a 1-byte fixed prefix extractor (prefix-only bloom
// filter), reopens it with a 9-byte fixed prefix extractor, and checks that
// every stored entry can still be found by Seek().
TEST_P(BlockBasedTableTest, SkipPrefixBloomFilter) {
  // If the DB is opened with a prefix extractor of a different name,
  // the prefix bloom filter is skipped when reading the file.
  BlockBasedTableOptions bbto = GetBlockBasedTableOptions();
  bbto.filter_policy.reset(NewBloomFilterPolicy(2));
  bbto.whole_key_filtering = false;

  Options opts;
  opts.comparator = BytewiseComparator();
  opts.table_factory.reset(new BlockBasedTableFactory(bbto));
  opts.prefix_extractor.reset(NewFixedPrefixTransform(1));

  TableConstructor constructor(opts.comparator);
  InternalKey stored_key("abcdefghijk", 1, kTypeValue);
  constructor.Add(stored_key.Encode().ToString(), "test");
  std::vector<std::string> sorted_keys;
  stl_wrappers::KVMap contents;
  const ImmutableOptions write_ioptions(opts);
  const MutableCFOptions write_moptions(opts);
  const InternalKeyComparator icmp(opts.comparator);
  constructor.Finish(opts, write_ioptions, write_moptions, bbto, icmp,
                     &sorted_keys, &contents);

  // TODO(Zhongyi): update test to use MutableCFOptions
  // Swap in a different prefix extractor before reopening the table.
  opts.prefix_extractor.reset(NewFixedPrefixTransform(9));
  const ImmutableOptions reopen_ioptions(opts);
  const MutableCFOptions reopen_moptions(opts);
  ASSERT_OK(constructor.Reopen(reopen_ioptions, reopen_moptions));
  auto table_reader = constructor.GetTableReader();
  ReadOptions read_opts;
  std::unique_ptr<InternalIterator> table_iter(table_reader->NewIterator(
      read_opts, reopen_moptions.prefix_extractor.get(), /*arena=*/nullptr,
      /*skip_filters=*/false, TableReaderCaller::kUncategorized));

  // Point lookup of every entry (there is only one).
  for (auto entry = contents.begin(); entry != contents.end(); ++entry) {
    table_iter->Seek(entry->first);
    ASSERT_TRUE(table_iter->Valid());
    ASSERT_OK(table_iter->status());
    ASSERT_EQ(table_iter->key(), entry->first);
    ASSERT_EQ(table_iter->value(), entry->second);
  }
}
|
|
|
|
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change was split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
3 years ago
|
|
|
// Writes a valid one-entry table, overwrites the checksum-type byte in the
// written file contents with an invalid value, and checks that reopening the
// table fails with the expected Corruption message.
TEST_P(BlockBasedTableTest, BadChecksumType) {
  BlockBasedTableOptions bbto = GetBlockBasedTableOptions();

  Options opts;
  opts.comparator = BytewiseComparator();
  opts.table_factory.reset(new BlockBasedTableFactory(bbto));

  TableConstructor constructor(opts.comparator);
  InternalKey stored_key("abc", 1, kTypeValue);
  constructor.Add(stored_key.Encode().ToString(), "test");
  std::vector<std::string> sorted_keys;
  stl_wrappers::KVMap contents;
  const ImmutableOptions write_ioptions(opts);
  const MutableCFOptions write_moptions(opts);
  const InternalKeyComparator icmp(opts.comparator);
  constructor.Finish(opts, write_ioptions, write_moptions, bbto, icmp,
                     &sorted_keys, &contents);

  // Corrupt checksum type (123 is invalid). The byte sits
  // Footer::kNewVersionsEncodedLength bytes before the end of the file and
  // is expected to hold kCRC32c before being overwritten.
  auto& file_sink = *constructor.TEST_GetSink();
  size_t file_size = file_sink.contents_.size();
  ASSERT_EQ(file_sink.contents_[file_size - Footer::kNewVersionsEncodedLength],
            kCRC32c);
  file_sink.contents_[file_size - Footer::kNewVersionsEncodedLength] =
      char{123};

  // (Re-)Open table file with bad checksum type
  const ImmutableOptions reopen_ioptions(opts);
  const MutableCFOptions reopen_moptions(opts);
  Status reopen_status = constructor.Reopen(reopen_ioptions, reopen_moptions);
  ASSERT_NOK(reopen_status);
  ASSERT_EQ(reopen_status.ToString(),
            "Corruption: Corrupt or unsupported checksum type: 123");
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
std::string ChecksumAsString(const std::string& data,
|
|
|
|
ChecksumType checksum_type) {
|
|
|
|
uint32_t v = ComputeBuiltinChecksum(checksum_type, data.data(), data.size());
|
|
|
|
|
|
|
|
// Verify consistency with other function
|
|
|
|
if (data.size() >= 1) {
|
|
|
|
EXPECT_EQ(v, ComputeBuiltinChecksumWithLastByte(
|
|
|
|
checksum_type, data.data(), data.size() - 1, data.back()));
|
|
|
|
}
|
|
|
|
// Little endian as in file
|
|
|
|
std::array<char, 4> raw_bytes;
|
|
|
|
EncodeFixed32(raw_bytes.data(), v);
|
|
|
|
return Slice(raw_bytes.data(), raw_bytes.size()).ToString(/*hex*/ true);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string ChecksumAsString(std::string* data, char new_last_byte,
|
|
|
|
ChecksumType checksum_type) {
|
|
|
|
data->back() = new_last_byte;
|
|
|
|
return ChecksumAsString(*data, checksum_type);
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
3 years ago
|
|
|
}
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
// Make sure that checksum values don't change in later versions, even if
|
|
|
|
// consistent within current version.
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
3 years ago
|
|
|
TEST_P(BlockBasedTableTest, ChecksumSchemas) {
|
|
|
|
std::string b0 = "x";
|
|
|
|
std::string b1 = "This is a short block!x";
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
3 years ago
|
|
|
std::string b2;
|
|
|
|
for (int i = 0; i < 100; ++i) {
|
|
|
|
b2.append("This is a long block!");
|
|
|
|
}
|
|
|
|
b2.append("x");
|
|
|
|
// Trailing 'x' will be replaced by compression type
|
|
|
|
|
|
|
|
std::string empty;
|
|
|
|
|
|
|
|
char ct1 = kNoCompression;
|
|
|
|
char ct2 = kSnappyCompression;
|
|
|
|
char ct3 = kZSTD;
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
3 years ago
|
|
|
|
|
|
|
// Note: first byte of trailer is compression type, last 4 are checksum
|
|
|
|
|
|
|
|
for (ChecksumType t : GetSupportedChecksums()) {
|
|
|
|
switch (t) {
|
|
|
|
case kNoChecksum:
|
|
|
|
EXPECT_EQ(ChecksumAsString(empty, t), "00000000");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "00000000");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "00000000");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "00000000");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "00000000");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "00000000");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "00000000");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "00000000");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "00000000");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "00000000");
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
3 years ago
|
|
|
break;
|
|
|
|
case kCRC32c:
|
|
|
|
EXPECT_EQ(ChecksumAsString(empty, t), "D8EA82A2");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "D28F2549");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "052B2843");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "46F8F711");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "583F0355");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "2F9B0A57");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "ECE7DA1D");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "943EF0AB");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "43A2EDB1");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "00E53D63");
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
3 years ago
|
|
|
break;
|
|
|
|
case kxxHash:
|
|
|
|
EXPECT_EQ(ChecksumAsString(empty, t), "055DCC02");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "3EB065CF");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "31F79238");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "320D2E00");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "4A2E5FB0");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "0BD9F652");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "B4107E50");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "20F4D4BA");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "8F1A1F99");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "A191A338");
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
3 years ago
|
|
|
break;
|
|
|
|
case kxxHash64:
|
|
|
|
EXPECT_EQ(ChecksumAsString(empty, t), "99E9D851");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "682705DB");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "30E7211B");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "B7BB58E8");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "B74655EF");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "B6C8BBBE");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "AED9E3B4");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "0D4999FE");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "F5932423");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "6B31BAB1");
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
3 years ago
|
|
|
break;
|
|
|
|
case kXXH3:
|
|
|
|
EXPECT_EQ(ChecksumAsString(empty, t), "00000000");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "C294D338");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "1B174353");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "2D0E20C8");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "B37FB5E6");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "6AFC258D");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "5CE54616");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "FA2D482E");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "23AED845");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "15B7BBDE");
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
3 years ago
|
|
|
break;
|
|
|
|
default:
|
|
|
|
// Force this test to be updated on new ChecksumTypes
|
|
|
|
assert(false);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void AddInternalKey(TableConstructor* c, const std::string& prefix,
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
std::string value = "v", int /*suffix_len*/ = 800) {
|
|
|
|
static Random rnd(1023);
|
|
|
|
InternalKey k(prefix + rnd.RandomString(800), 0, kTypeValue);
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
c->Add(k.Encode().ToString(), value);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Shared driver for the index-type tests. Builds a ten-key table with the
// caller's `table_options` (block_size and block_cache are overridden here),
// then verifies iterator positioning for existing prefixes, existing keys,
// keys past the end of a prefix, non-existent prefixes, and repeated
// Seek()/SeekToLast()/Next()/Prev() sequences.
void TableTest::IndexTest(BlockBasedTableOptions table_options) {
  TableConstructor c(BytewiseComparator());

  // The prefix extractor below uses the first 3 bytes of each key. Every entry
  // gets a long random suffix from AddInternalKey() so that the key/value is
  // big enough to fill one block.
  const std::vector<std::string> key_heads = {
      "0015", "0035", "0054", "0055", "0056",
      "0057", "0058", "0075", "0076", "0095",
  };
  for (const std::string& head : key_heads) {
    AddInternalKey(&c, head);
  }

  std::vector<std::string> keys;
  stl_wrappers::KVMap kvmap;
  Options options;
  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
  table_options.block_size = 1700;
  table_options.block_cache = NewLRUCache(1024, 4);
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));

  auto comparator =
      std::make_unique<InternalKeyComparator>(BytewiseComparator());
  const ImmutableOptions ioptions(options);
  const MutableCFOptions moptions(options);
  c.Finish(options, ioptions, moptions, table_options, *comparator, &keys,
           &kvmap);
  auto reader = c.GetTableReader();

  auto props = reader->GetTableProperties();
  ASSERT_EQ(5u, props->num_data_blocks);

  // TODO(Zhongyi): update test to use MutableCFOptions
  ReadOptions read_options;
  std::unique_ptr<InternalIterator> index_iter(reader->NewIterator(
      read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr,
      /*skip_filters=*/false, TableReaderCaller::kUncategorized));

  // -- Seek to bare prefixes: the exact keys do not exist, but keys sharing
  // the prefix do.
  std::vector<std::string> prefixes = {"001", "003", "005", "007", "009"};
  std::vector<std::string> lower_bound = {
      keys[0], keys[1], keys[2], keys[7], keys[9],
  };

  // Seeking to a prefix must land on the first key carrying that prefix.
  for (size_t idx = 0; idx < prefixes.size(); ++idx) {
    index_iter->Seek(InternalKey(prefixes[idx], 0, kTypeValue).Encode());
    ASSERT_OK(index_iter->status());
    ASSERT_TRUE(index_iter->Valid());

    ASSERT_EQ(lower_bound[idx], index_iter->key().ToString());
    ASSERT_EQ("v", index_iter->value().ToString());
  }

  // Expected landing key when seeking just past each prefix. There is no entry
  // for the last prefix because that seek is expected to run off the end.
  std::vector<std::string> upper_bound = {
      keys[1],
      keys[2],
      keys[7],
      keys[9],
  };

  // Seek to every key that was inserted and expect an exact match.
  for (const auto& entry : kvmap) {
    const std::string user_key = ExtractUserKey(entry.first).ToString();
    index_iter->Seek(user_key);

    ASSERT_OK(index_iter->status());
    ASSERT_TRUE(index_iter->Valid());

    ASSERT_EQ(entry.first, index_iter->key().ToString());
    ASSERT_EQ(entry.second, index_iter->value().ToString());
  }

  for (size_t idx = 0; idx < prefixes.size(); ++idx) {
    // prefix + "9" sorts after every inserted key that has this prefix.
    const std::string past_prefix = prefixes[idx] + "9";
    index_iter->Seek(InternalKey(past_prefix, 0, kTypeValue).Encode());

    // The status may be OK or NotFound, but NotFound must never be reported
    // together with a valid position.
    ASSERT_TRUE(index_iter->status().ok() || index_iter->status().IsNotFound());
    ASSERT_FALSE(index_iter->status().IsNotFound() && index_iter->Valid());

    const bool is_last_prefix = (idx == prefixes.size() - 1);
    if (is_last_prefix) {
      // Nothing sorts after the last prefix.
      ASSERT_FALSE(index_iter->Valid());
    } else {
      ASSERT_TRUE(index_iter->Valid());
      // Expect the first key of the next prefix.
      ASSERT_EQ(upper_bound[idx], index_iter->key().ToString());
      ASSERT_EQ("v", index_iter->value().ToString());
    }
  }

  // Prefixes that match none of the inserted keys.
  std::vector<std::string> non_exist_prefixes = {"002", "004", "006", "008"};
  for (const auto& prefix : non_exist_prefixes) {
    index_iter->Seek(InternalKey(prefix, 0, kTypeValue).Encode());

    ASSERT_OK(index_iter->status());
    // Seek() to a non-existing prefix should yield either an invalid iterator
    // or a key whose prefix is greater than the target.
    if (index_iter->Valid()) {
      Slice ukey = ExtractUserKey(index_iter->key());
      Slice ukey_prefix = options.prefix_extractor->Transform(ukey);
      ASSERT_LT(BytewiseComparator()->Compare(prefix, ukey_prefix), 0);
    }
  }
  for (const auto& prefix : non_exist_prefixes) {
    index_iter->SeekForPrev(InternalKey(prefix, 0, kTypeValue).Encode());

    ASSERT_OK(index_iter->status());
    // SeekForPrev() to a non-existing prefix should yield either an invalid
    // iterator or a key whose prefix is smaller than the target.
    if (index_iter->Valid()) {
      Slice ukey = ExtractUserKey(index_iter->key());
      Slice ukey_prefix = options.prefix_extractor->Transform(ukey);
      ASSERT_GT(BytewiseComparator()->Compare(prefix, ukey_prefix), 0);
    }
  }

  {
    // Test the reseek case. It should impact the partitioned index more.
    ReadOptions total_order_ro;
    total_order_ro.total_order_seek = true;
    std::unique_ptr<InternalIterator> reseek_iter(reader->NewIterator(
        total_order_ro, moptions.prefix_extractor.get(), /*arena=*/nullptr,
        /*skip_filters=*/false, TableReaderCaller::kUncategorized));

    // Things to cover in partitioned index:
    // 1. Both Seek() and SeekToLast() have an optimization that avoids
    //    re-seeking the leaf index block if it stays the same one, and they
    //    reuse the same variable.
    // 2. When Next() or Prev() is called, the block moves, so the
    //    optimization should kick in only with the current one.
    reseek_iter->Seek(InternalKey("0055", 0, kTypeValue).Encode());
    ASSERT_TRUE(reseek_iter->Valid());
    ASSERT_EQ("0055", reseek_iter->key().ToString().substr(0, 4));

    reseek_iter->SeekToLast();
    ASSERT_TRUE(reseek_iter->Valid());
    ASSERT_EQ("0095", reseek_iter->key().ToString().substr(0, 4));

    reseek_iter->Seek(InternalKey("0055", 0, kTypeValue).Encode());
    ASSERT_TRUE(reseek_iter->Valid());
    ASSERT_EQ("0055", reseek_iter->key().ToString().substr(0, 4));

    reseek_iter->SeekToLast();
    ASSERT_TRUE(reseek_iter->Valid());
    ASSERT_EQ("0095", reseek_iter->key().ToString().substr(0, 4));
    reseek_iter->Prev();
    ASSERT_TRUE(reseek_iter->Valid());
    reseek_iter->Prev();
    ASSERT_TRUE(reseek_iter->Valid());
    ASSERT_EQ("0075", reseek_iter->key().ToString().substr(0, 4));

    reseek_iter->Seek(InternalKey("0095", 0, kTypeValue).Encode());
    ASSERT_TRUE(reseek_iter->Valid());
    ASSERT_EQ("0095", reseek_iter->key().ToString().substr(0, 4));
    reseek_iter->Prev();
    ASSERT_TRUE(reseek_iter->Valid());
    reseek_iter->Prev();
    ASSERT_TRUE(reseek_iter->Valid());
    ASSERT_EQ("0075", reseek_iter->key().ToString().substr(0, 4));

    reseek_iter->SeekToLast();
    ASSERT_TRUE(reseek_iter->Valid());
    ASSERT_EQ("0095", reseek_iter->key().ToString().substr(0, 4));

    reseek_iter->Seek(InternalKey("0095", 0, kTypeValue).Encode());
    ASSERT_TRUE(reseek_iter->Valid());
    ASSERT_EQ("0095", reseek_iter->key().ToString().substr(0, 4));

    reseek_iter->Prev();
    ASSERT_TRUE(reseek_iter->Valid());
    reseek_iter->Prev();
    ASSERT_TRUE(reseek_iter->Valid());
    ASSERT_EQ("0075", reseek_iter->key().ToString().substr(0, 4));

    reseek_iter->Seek(InternalKey("0075", 0, kTypeValue).Encode());
    ASSERT_TRUE(reseek_iter->Valid());
    ASSERT_EQ("0075", reseek_iter->key().ToString().substr(0, 4));

    reseek_iter->Next();
    ASSERT_TRUE(reseek_iter->Valid());
    reseek_iter->Next();
    ASSERT_TRUE(reseek_iter->Valid());
    ASSERT_EQ("0095", reseek_iter->key().ToString().substr(0, 4));

    reseek_iter->SeekToLast();
    ASSERT_TRUE(reseek_iter->Valid());
    ASSERT_EQ("0095", reseek_iter->key().ToString().substr(0, 4));
  }

  c.ResetTableReader();
}
|
|
|
|
|
|
|
|
// Runs the shared IndexTest() scenario with the kBinarySearch index type.
TEST_P(BlockBasedTableTest, BinaryIndexTest) {
  auto binary_opts = GetBlockBasedTableOptions();
  binary_opts.index_type = BlockBasedTableOptions::kBinarySearch;
  IndexTest(binary_opts);
}
|
|
|
|
|
|
|
|
// Runs the shared IndexTest() scenario with the kHashSearch index type.
TEST_P(BlockBasedTableTest, HashIndexTest) {
  auto hash_opts = GetBlockBasedTableOptions();
  hash_opts.index_type = BlockBasedTableOptions::kHashSearch;
  IndexTest(hash_opts);
}
|
|
|
|
|
|
|
|
// Runs the shared IndexTest() scenario with the kTwoLevelIndexSearch index
// type once for every metadata_block_size from 1 up to one more than an
// estimated upper bound on the index size (5 keys * 32 bytes).
TEST_P(BlockBasedTableTest, PartitionIndexTest) {
  constexpr int kMaxIndexKeys = 5;
  constexpr int kEstMaxIndexEntrySize = 32;
  constexpr int kEstMaxIndexSize = kMaxIndexKeys * kEstMaxIndexEntrySize;

  for (int partition_size = 1; partition_size <= kEstMaxIndexSize + 1;
       ++partition_size) {
    auto partitioned_opts = GetBlockBasedTableOptions();
    partitioned_opts.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
    partitioned_opts.metadata_block_size = partition_size;
    IndexTest(partitioned_opts);
  }
}
|
|
|
|
|
|
|
|
// Builds a one-key table and seeks to that key twice through an iterator whose
// read tier is kBlockCacheTier. Both seeks are expected to leave the iterator
// invalid with an Incomplete status; the second seek is a regression check for
// a past crash.
TEST_P(BlockBasedTableTest, IndexSeekOptimizationIncomplete) {
  auto comparator =
      std::make_unique<InternalKeyComparator>(BytewiseComparator());
  auto table_options = GetBlockBasedTableOptions();
  Options options;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  const ImmutableOptions ioptions(options);
  const MutableCFOptions moptions(options);

  TableConstructor c(BytewiseComparator());
  AddInternalKey(&c, "pika");

  std::vector<std::string> keys;
  stl_wrappers::KVMap kvmap;
  c.Finish(options, ioptions, moptions, table_options, *comparator, &keys,
           &kvmap);
  ASSERT_EQ(1u, keys.size());

  auto reader = c.GetTableReader();
  ReadOptions cache_only_ro;
  cache_only_ro.read_tier = ReadTier::kBlockCacheTier;
  std::unique_ptr<InternalIterator> iter(reader->NewIterator(
      cache_only_ro, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
      /*skip_filters=*/false, TableReaderCaller::kUncategorized));

  // Wraps a user key into an encoded internal key string.
  auto to_internal_key = [](Slice user_key) {
    return InternalKey(user_key, 0, kTypeValue).Encode().ToString();
  };

  // Two identical attempts: repeating the seek after the first Incomplete
  // result used to crash at some point.
  for (int attempt = 0; attempt < 2; ++attempt) {
    iter->Seek(to_internal_key("pika"));
    ASSERT_FALSE(iter->Valid());
    ASSERT_TRUE(iter->status().IsIncomplete());
  }
}
|
|
|
|
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in a small number of sst files; in this case, the cost of the initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
// Runs the shared IndexTest() scenario with the kBinarySearchWithFirstKey
// index type.
TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKey1) {
  auto first_key_opts = GetBlockBasedTableOptions();
  first_key_opts.index_type = BlockBasedTableOptions::kBinarySearchWithFirstKey;
  IndexTest(first_key_opts);
}
|
|
|
|
|
|
|
|
class CustomFlushBlockPolicy : public FlushBlockPolicyFactory,
|
|
|
|
public FlushBlockPolicy {
|
|
|
|
public:
|
|
|
|
explicit CustomFlushBlockPolicy(std::vector<int> keys_per_block)
|
|
|
|
: keys_per_block_(keys_per_block) {}
|
|
|
|
|
|
|
|
const char* Name() const override { return "CustomFlushBlockPolicy"; }
|
|
|
|
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
FlushBlockPolicy* NewFlushBlockPolicy(const BlockBasedTableOptions&,
|
|
|
|
const BlockBuilder&) const override {
|
|
|
|
return new CustomFlushBlockPolicy(keys_per_block_);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Update(const Slice&, const Slice&) override {
|
|
|
|
if (keys_in_current_block_ >= keys_per_block_.at(current_block_idx_)) {
|
|
|
|
++current_block_idx_;
|
|
|
|
keys_in_current_block_ = 1;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
++keys_in_current_block_;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<int> keys_per_block_;
|
|
|
|
|
|
|
|
int current_block_idx_ = 0;
|
|
|
|
int keys_in_current_block_ = 0;
|
|
|
|
};
|
|
|
|
|
|
|
|
TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKey2) {
|
|
|
|
for (int use_first_key = 0; use_first_key < 2; ++use_first_key) {
|
|
|
|
SCOPED_TRACE("use_first_key = " + std::to_string(use_first_key));
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
table_options.index_type =
|
|
|
|
use_first_key ? BlockBasedTableOptions::kBinarySearchWithFirstKey
|
|
|
|
: BlockBasedTableOptions::kBinarySearch;
|
|
|
|
table_options.block_cache = NewLRUCache(10000); // fits all blocks
|
|
|
|
table_options.index_shortening =
|
|
|
|
BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
|
|
|
|
table_options.flush_block_policy_factory =
|
|
|
|
std::make_shared<CustomFlushBlockPolicy>(std::vector<int>{2, 1, 3, 2});
|
|
|
|
Options options;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
options.statistics = CreateDBStatistics();
|
|
|
|
Statistics* stats = options.statistics.get();
|
|
|
|
std::unique_ptr<InternalKeyComparator> comparator(
|
|
|
|
new InternalKeyComparator(BytewiseComparator()));
|
|
|
|
const ImmutableOptions ioptions(options);
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
|
|
|
|
TableConstructor c(BytewiseComparator());
|
|
|
|
|
|
|
|
// Block 0.
|
|
|
|
AddInternalKey(&c, "aaaa", "v0");
|
|
|
|
AddInternalKey(&c, "aaac", "v1");
|
|
|
|
|
|
|
|
// Block 1.
|
|
|
|
AddInternalKey(&c, "aaca", "v2");
|
|
|
|
|
|
|
|
// Block 2.
|
|
|
|
AddInternalKey(&c, "caaa", "v3");
|
|
|
|
AddInternalKey(&c, "caac", "v4");
|
|
|
|
AddInternalKey(&c, "caae", "v5");
|
|
|
|
|
|
|
|
// Block 3.
|
|
|
|
AddInternalKey(&c, "ccaa", "v6");
|
|
|
|
AddInternalKey(&c, "ccac", "v7");
|
|
|
|
|
|
|
|
// Write the file.
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
c.Finish(options, ioptions, moptions, table_options, *comparator, &keys,
|
|
|
|
&kvmap);
|
|
|
|
ASSERT_EQ(8, keys.size());
|
|
|
|
|
|
|
|
auto reader = c.GetTableReader();
|
|
|
|
auto props = reader->GetTableProperties();
|
|
|
|
ASSERT_EQ(4u, props->num_data_blocks);
|
|
|
|
ReadOptions read_options;
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s moves through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in a small number of sst files; in this case, the cost of the initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains the first key of each block, we don't have to read the block until it actually makes it to the top of the merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
std::unique_ptr<InternalIterator> iter(reader->NewIterator(
|
|
|
|
read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
5 years ago
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized,
|
|
|
|
/*compaction_readahead_size=*/0, /*allow_unprepared_value=*/true));
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s moves through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in a small number of sst files; in this case, the cost of the initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains the first key of each block, we don't have to read the block until it actually makes it to the top of the merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
|
|
|
|
// Shouldn't have read data blocks before iterator is seeked.
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
auto ikey = [](Slice user_key) {
|
|
|
|
return InternalKey(user_key, 0, kTypeValue).Encode().ToString();
|
|
|
|
};
|
|
|
|
|
|
|
|
// Seek to a key between blocks. If index contains first key, we shouldn't
|
|
|
|
// read any data blocks until value is requested.
|
|
|
|
iter->Seek(ikey("aaba"));
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[2], iter->key().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 0 : 1,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
5 years ago
|
|
|
ASSERT_TRUE(iter->PrepareValue());
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s moves through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in a small number of sst files; in this case, the cost of the initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains the first key of each block, we don't have to read the block until it actually makes it to the top of the merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
EXPECT_EQ("v2", iter->value().ToString());
|
|
|
|
EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
// Seek to the middle of a block. The block should be read right away.
|
|
|
|
iter->Seek(ikey("caab"));
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[4], iter->key().ToString());
|
|
|
|
EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
5 years ago
|
|
|
ASSERT_TRUE(iter->PrepareValue());
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s moves through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in a small number of sst files; in this case, the cost of the initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains the first key of each block, we don't have to read the block until it actually makes it to the top of the merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
EXPECT_EQ("v4", iter->value().ToString());
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
// Seek to just before the same block and don't access value.
|
|
|
|
// The iterator should keep pinning the block contents.
|
|
|
|
iter->Seek(ikey("baaa"));
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[3], iter->key().ToString());
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
// Seek to the same block again to check that the block is still pinned.
|
|
|
|
iter->Seek(ikey("caae"));
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[5], iter->key().ToString());
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
5 years ago
|
|
|
ASSERT_TRUE(iter->PrepareValue());
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s moves through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in a small number of sst files; in this case, the cost of the initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains the first key of each block, we don't have to read the block until it actually makes it to the top of the merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
EXPECT_EQ("v5", iter->value().ToString());
|
|
|
|
EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
// Step forward and fall through to the next block. Don't access value.
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[6], iter->key().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 2 : 3,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
// Step forward again. Block should be read.
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[7], iter->key().ToString());
|
|
|
|
EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
5 years ago
|
|
|
ASSERT_TRUE(iter->PrepareValue());
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s moves through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in a small number of sst files; in this case, the cost of the initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains the first key of each block, we don't have to read the block until it actually makes it to the top of the merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
EXPECT_EQ("v7", iter->value().ToString());
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
// Step forward and reach the end.
|
|
|
|
iter->Next();
|
|
|
|
EXPECT_FALSE(iter->Valid());
|
|
|
|
EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
// Seek to a single-key block and step forward without accessing value.
|
|
|
|
iter->Seek(ikey("aaca"));
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[2], iter->key().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 0 : 1,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[3], iter->key().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 1 : 2,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for the sst file iterator to sometimes report a key from the index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which RocksDB code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report an error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and the kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access the value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from the memtable or from the index or something) or wouldn't benefit from the deferred loading of the value (e.g. the compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Let me know if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: `make -j5 check`. Will also deploy to some LogDevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
5 years ago
|
|
|
ASSERT_TRUE(iter->PrepareValue());
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
EXPECT_EQ("v3", iter->value().ToString());
|
|
|
|
EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
|
|
|
|
// Seek between blocks and step back without accessing value.
|
|
|
|
iter->Seek(ikey("aaca"));
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[2], iter->key().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 2 : 3,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[1], iter->key().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 2 : 3,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
// All blocks are in cache now, there'll be no more misses ever.
|
|
|
|
EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: `make -j5 check`. Will also deploy to some LogDevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
5 years ago
|
|
|
ASSERT_TRUE(iter->PrepareValue());
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
EXPECT_EQ("v1", iter->value().ToString());
|
|
|
|
|
|
|
|
// Next into the next block again.
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[2], iter->key().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 2 : 4,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
// Seek to first and step back without accessing value.
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[0], iter->key().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 2 : 5,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
iter->Prev();
|
|
|
|
EXPECT_FALSE(iter->Valid());
|
|
|
|
EXPECT_EQ(use_first_key ? 2 : 5,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
// Do some SeekForPrev() and SeekToLast() just to cover all methods.
|
|
|
|
iter->SeekForPrev(ikey("caad"));
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[4], iter->key().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 3 : 6,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: `make -j5 check`. Will also deploy to some LogDevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
5 years ago
|
|
|
ASSERT_TRUE(iter->PrepareValue());
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
EXPECT_EQ("v4", iter->value().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 3 : 6,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
iter->SeekToLast();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[7], iter->key().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 4 : 7,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: `make -j5 check`. Will also deploy to some LogDevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
5 years ago
|
|
|
ASSERT_TRUE(iter->PrepareValue());
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
EXPECT_EQ("v7", iter->value().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 4 : 7,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
|
|
|
|
c.ResetTableReader();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKeyGlobalSeqno) {
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
table_options.index_type = BlockBasedTableOptions::kBinarySearchWithFirstKey;
|
|
|
|
table_options.block_cache = NewLRUCache(10000);
|
|
|
|
Options options;
|
|
|
|
options.statistics = CreateDBStatistics();
|
|
|
|
Statistics* stats = options.statistics.get();
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
std::unique_ptr<InternalKeyComparator> comparator(
|
|
|
|
new InternalKeyComparator(BytewiseComparator()));
|
|
|
|
const ImmutableOptions ioptions(options);
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
|
|
|
|
TableConstructor c(BytewiseComparator(), /* convert_to_internal_key */ false,
|
|
|
|
/* level */ -1, /* largest_seqno */ 42);
|
|
|
|
|
|
|
|
c.Add(InternalKey("b", 0, kTypeValue).Encode().ToString(), "x");
|
|
|
|
c.Add(InternalKey("c", 0, kTypeValue).Encode().ToString(), "y");
|
|
|
|
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
c.Finish(options, ioptions, moptions, table_options, *comparator, &keys,
|
|
|
|
&kvmap);
|
|
|
|
ASSERT_EQ(2, keys.size());
|
|
|
|
|
|
|
|
auto reader = c.GetTableReader();
|
|
|
|
auto props = reader->GetTableProperties();
|
|
|
|
ASSERT_EQ(1u, props->num_data_blocks);
|
|
|
|
ReadOptions read_options;
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
std::unique_ptr<InternalIterator> iter(reader->NewIterator(
|
|
|
|
read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
5 years ago
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized,
|
|
|
|
/*compaction_readahead_size=*/0, /*allow_unprepared_value=*/true));
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
|
|
|
|
iter->Seek(InternalKey("a", 0, kTypeValue).Encode().ToString());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(InternalKey("b", 42, kTypeValue).Encode().ToString(),
|
|
|
|
iter->key().ToString());
|
|
|
|
EXPECT_NE(keys[0], iter->key().ToString());
|
|
|
|
// Key should have been served from index, without reading data blocks.
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
5 years ago
|
|
|
ASSERT_TRUE(iter->PrepareValue());
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
EXPECT_EQ("x", iter->value().ToString());
|
|
|
|
EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
EXPECT_EQ(InternalKey("b", 42, kTypeValue).Encode().ToString(),
|
|
|
|
iter->key().ToString());
|
|
|
|
|
|
|
|
c.ResetTableReader();
|
|
|
|
}
|
|
|
|
|
|
|
|
// It's very hard to figure out the index block size of a block accurately.
|
|
|
|
// To make sure we get the index size, we just make sure as key number
|
|
|
|
// grows, the filter block size also grows.
|
|
|
|
TEST_P(BlockBasedTableTest, IndexSizeStat) {
|
|
|
|
uint64_t last_index_size = 0;
|
|
|
|
|
|
|
|
// we need to use random keys since the pure human readable texts
|
|
|
|
// may be well compressed, resulting insignifcant change of index
|
|
|
|
// block size.
|
|
|
|
Random rnd(test::RandomSeed());
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
|
|
|
|
for (int i = 0; i < 100; ++i) {
|
|
|
|
keys.push_back(rnd.RandomString(10000));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Each time we load one more key to the table. the table index block
|
|
|
|
// size is expected to be larger than last time's.
|
|
|
|
for (size_t i = 1; i < keys.size(); ++i) {
|
|
|
|
TableConstructor c(BytewiseComparator(),
|
|
|
|
true /* convert_to_internal_key_ */);
|
|
|
|
for (size_t j = 0; j < i; ++j) {
|
|
|
|
c.Add(keys[j], "val");
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<std::string> ks;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
Options options;
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
table_options.block_restart_interval = 1;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
const ImmutableOptions ioptions(options);
|
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
|
|
|
GetPlainInternalComparator(options.comparator), &ks, &kvmap);
|
|
|
|
auto index_size = c.GetTableReader()->GetTableProperties()->index_size;
|
|
|
|
ASSERT_GT(index_size, last_index_size);
|
|
|
|
last_index_size = index_size;
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
9 years ago
|
|
|
c.ResetTableReader();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Checks that the `num_data_blocks` table property equals the number of
// key/value pairs when each pair is sized to occupy roughly one block.
TEST_P(BlockBasedTableTest, NumBlockStat) {
  Random rnd(test::RandomSeed());
  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);

  Options options;
  options.compression = kNoCompression;
  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
  table_options.block_restart_interval = 1;
  table_options.block_size = 1000;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));

  // Each key/value pair is slightly smaller than the configured block size,
  // so every data block ends up holding roughly one pair.
  for (int entry = 0; entry < 10; ++entry) {
    c.Add(rnd.RandomString(900), "val");
  }

  std::vector<std::string> sorted_keys;
  stl_wrappers::KVMap kvmap;
  const ImmutableOptions ioptions(options);
  const MutableCFOptions moptions(options);
  c.Finish(options, ioptions, moptions, table_options,
           GetPlainInternalComparator(options.comparator), &sorted_keys,
           &kvmap);

  const auto num_data_blocks =
      c.GetTableReader()->GetTableProperties()->num_data_blocks;
  ASSERT_EQ(kvmap.size(), num_data_blocks);

  c.ResetTableReader();
}
|
|
|
|
|
|
|
|
// Issues the same point lookup twice against a table whose index and filter
// blocks go through the block cache, then checks the recorded block cache
// trace against the expected sequence of accesses.
TEST_P(BlockBasedTableTest, TracingGetTest) {
  TableConstructor c(BytewiseComparator());
  Options options;
  options.create_if_missing = true;

  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
  table_options.block_cache = NewLRUCache(1024 * 1024, 0);
  table_options.cache_index_and_filter_blocks = true;
  table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
  options.table_factory.reset(new BlockBasedTableFactory(table_options));

  // NOTE(review): SetupTracingTest is defined outside this block; presumably
  // it populates the constructor and enables tracing -- confirm there.
  SetupTracingTest(&c);

  std::vector<std::string> sorted_keys;
  stl_wrappers::KVMap kvmap;
  ImmutableOptions ioptions(options);
  MutableCFOptions moptions(options);
  c.Finish(options, ioptions, moptions, table_options,
           GetPlainInternalComparator(options.comparator), &sorted_keys,
           &kvmap);

  std::string user_key = "k01";
  InternalKey internal_key(user_key, 0, kTypeValue);
  std::string encoded_key = internal_key.Encode().ToString();

  // Two identical lookups, tagged with tracing get ids 1 and 2.
  for (uint32_t get_id = 1; get_id <= 2; ++get_id) {
    PinnableSlice value;
    GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
                           GetContext::kNotFound, user_key, &value, nullptr,
                           nullptr, true, nullptr, nullptr, nullptr, nullptr,
                           nullptr, nullptr, /*tracing_get_id=*/get_id);
    get_perf_context()->Reset();
    ASSERT_OK(c.GetTableReader()->Get(ReadOptions(), encoded_key, &get_context,
                                      moptions.prefix_extractor.get()));
    ASSERT_EQ(get_context.State(), GetContext::kFound);
    ASSERT_EQ(value.ToString(), kDummyValue);
  }

  // Build the expected trace. `record` is mutated incrementally: every field
  // not touched by expect_access() keeps the value it was last assigned.
  std::vector<BlockCacheTraceRecord> expected_records;
  BlockCacheTraceRecord record;
  auto expect_access = [&expected_records, &record](TraceType block_type,
                                                    Boolean is_cache_hit) {
    record.block_type = block_type;
    record.is_cache_hit = is_cache_hit;
    expected_records.push_back(record);
  };

  // The first two records are the prefetch of the index and filter blocks,
  // both cache misses.
  record.caller = TableReaderCaller::kPrefetch;
  record.no_insert = Boolean::kFalse;
  expect_access(TraceType::kBlockTraceIndexBlock, Boolean::kFalse);
  expect_access(TraceType::kBlockTraceFilterBlock, Boolean::kFalse);

  // First get: index and filter blocks hit the cache, the data block misses.
  record.get_id = 1;
  record.caller = TableReaderCaller::kUserGet;
  record.get_from_user_specified_snapshot = Boolean::kFalse;
  record.referenced_key = encoded_key;
  record.referenced_key_exist_in_block = Boolean::kTrue;
  expect_access(TraceType::kBlockTraceIndexBlock, Boolean::kTrue);
  expect_access(TraceType::kBlockTraceFilterBlock, Boolean::kTrue);
  expect_access(TraceType::kBlockTraceDataBlock, Boolean::kFalse);

  // Second get: every access is a cache hit.
  record.get_id = 2;
  expect_access(TraceType::kBlockTraceIndexBlock, Boolean::kTrue);
  expect_access(TraceType::kBlockTraceFilterBlock, Boolean::kTrue);
  expect_access(TraceType::kBlockTraceDataBlock, Boolean::kTrue);

  VerifyBlockAccessTrace(&c, expected_records);
  c.ResetTableReader();
}
|
|
|
|
|
|
|
|
// Calls ApproximateOffsetOf() twice for the same key and checks that the
// block cache trace contains the prefetch records followed by one index
// block access per call.
TEST_P(BlockBasedTableTest, TracingApproximateOffsetOfTest) {
  TableConstructor c(BytewiseComparator());
  Options options;
  options.create_if_missing = true;

  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
  table_options.block_cache = NewLRUCache(1024 * 1024, 0);
  table_options.cache_index_and_filter_blocks = true;
  table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
  options.table_factory.reset(new BlockBasedTableFactory(table_options));

  // NOTE(review): SetupTracingTest is defined outside this block; presumably
  // it populates the constructor and enables tracing -- confirm there.
  SetupTracingTest(&c);

  std::vector<std::string> sorted_keys;
  stl_wrappers::KVMap kvmap;
  ImmutableOptions ioptions(options);
  MutableCFOptions moptions(options);
  c.Finish(options, ioptions, moptions, table_options,
           GetPlainInternalComparator(options.comparator), &sorted_keys,
           &kvmap);

  // The looked-up key is the same in both rounds, so encode it once.
  const std::string user_key = "k01";
  const InternalKey internal_key(user_key, 0, kTypeValue);
  const std::string encoded_key = internal_key.Encode().ToString();
  for (int round = 0; round < 2; ++round) {
    c.GetTableReader()->ApproximateOffsetOf(
        encoded_key, TableReaderCaller::kUserApproximateSize);
  }

  // Build the expected trace. `record` is mutated incrementally: every field
  // not touched by expect_access() keeps the value it was last assigned.
  std::vector<BlockCacheTraceRecord> expected_records;
  BlockCacheTraceRecord record;
  auto expect_access = [&expected_records, &record](TraceType block_type,
                                                    Boolean is_cache_hit) {
    record.block_type = block_type;
    record.is_cache_hit = is_cache_hit;
    expected_records.push_back(record);
  };

  // The first two records are the prefetch of the index and filter blocks,
  // both cache misses.
  record.caller = TableReaderCaller::kPrefetch;
  record.no_insert = Boolean::kFalse;
  expect_access(TraceType::kBlockTraceIndexBlock, Boolean::kFalse);
  expect_access(TraceType::kBlockTraceFilterBlock, Boolean::kFalse);

  // Each ApproximateOffsetOf() call touches only the index block, which is
  // already cached, so two identical hit records follow.
  record.caller = TableReaderCaller::kUserApproximateSize;
  expect_access(TraceType::kBlockTraceIndexBlock, Boolean::kTrue);
  expect_access(TraceType::kBlockTraceIndexBlock, Boolean::kTrue);

  VerifyBlockAccessTrace(&c, expected_records);
  c.ResetTableReader();
}
|
|
|
|
|
|
|
|
// Scans the whole table twice with a fresh iterator each time and checks the
// block cache trace: data blocks miss on the first scan and hit on the second.
TEST_P(BlockBasedTableTest, TracingIterator) {
  TableConstructor c(BytewiseComparator());
  Options options;
  options.create_if_missing = true;

  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
  table_options.block_cache = NewLRUCache(1024 * 1024, 0);
  table_options.cache_index_and_filter_blocks = true;
  table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
  options.table_factory.reset(new BlockBasedTableFactory(table_options));

  // NOTE(review): SetupTracingTest is defined outside this block; presumably
  // it populates the constructor and enables tracing -- confirm there.
  SetupTracingTest(&c);

  std::vector<std::string> sorted_keys;
  stl_wrappers::KVMap kvmap;
  ImmutableOptions ioptions(options);
  MutableCFOptions moptions(options);
  c.Finish(options, ioptions, moptions, table_options,
           GetPlainInternalComparator(options.comparator), &sorted_keys,
           &kvmap);

  for (uint32_t pass = 1; pass <= 2; ++pass) {
    ReadOptions read_options;
    std::unique_ptr<InternalIterator> iter(c.GetTableReader()->NewIterator(
        read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr,
        /*skip_filters=*/false, TableReaderCaller::kUserIterator));
    // Walk every entry; the key and value are read but their contents are
    // not inspected.
    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
      iter->key();
      iter->value();
    }
    ASSERT_OK(iter->status());
    iter.reset();
  }

  // Build the expected trace. `record` is mutated incrementally: every field
  // not touched by expect_access() keeps the value it was last assigned.
  std::vector<BlockCacheTraceRecord> expected_records;
  BlockCacheTraceRecord record;
  auto expect_access = [&expected_records, &record](TraceType block_type,
                                                    Boolean is_cache_hit) {
    record.block_type = block_type;
    record.is_cache_hit = is_cache_hit;
    expected_records.push_back(record);
  };

  // The first two records are the prefetch of the index and filter blocks,
  // both cache misses.
  record.caller = TableReaderCaller::kPrefetch;
  record.no_insert = Boolean::kFalse;
  expect_access(TraceType::kBlockTraceIndexBlock, Boolean::kFalse);
  expect_access(TraceType::kBlockTraceFilterBlock, Boolean::kFalse);

  // First scan: three records -- one index block hit followed by two data
  // block misses.
  record.caller = TableReaderCaller::kUserIterator;
  expect_access(TraceType::kBlockTraceIndexBlock, Boolean::kTrue);
  expect_access(TraceType::kBlockTraceDataBlock, Boolean::kFalse);
  expect_access(TraceType::kBlockTraceDataBlock, Boolean::kFalse);

  // Second scan: the same three accesses, all served from the cache.
  expect_access(TraceType::kBlockTraceIndexBlock, Boolean::kTrue);
  expect_access(TraceType::kBlockTraceDataBlock, Boolean::kTrue);
  expect_access(TraceType::kBlockTraceDataBlock, Boolean::kTrue);

  VerifyBlockAccessTrace(&c, expected_records);
  c.ResetTableReader();
}
|
|
|
|
|
|
|
|
// A simple tool that takes a snapshot of the block cache statistics.
|
|
|
|
class BlockCachePropertiesSnapshot {
|
|
|
|
public:
|
|
|
|
explicit BlockCachePropertiesSnapshot(Statistics* statistics) {
|
|
|
|
block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_MISS);
|
|
|
|
block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_HIT);
|
|
|
|
index_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS);
|
|
|
|
index_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT);
|
|
|
|
data_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_DATA_MISS);
|
|
|
|
data_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_DATA_HIT);
|
|
|
|
filter_block_cache_miss =
|
|
|
|
statistics->getTickerCount(BLOCK_CACHE_FILTER_MISS);
|
|
|
|
filter_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT);
|
|
|
|
block_cache_bytes_read = statistics->getTickerCount(BLOCK_CACHE_BYTES_READ);
|
|
|
|
block_cache_bytes_write =
|
|
|
|
statistics->getTickerCount(BLOCK_CACHE_BYTES_WRITE);
|
|
|
|
}
|
|
|
|
|
|
|
|
void AssertIndexBlockStat(int64_t expected_index_block_cache_miss,
|
|
|
|
int64_t expected_index_block_cache_hit) {
|
|
|
|
ASSERT_EQ(expected_index_block_cache_miss, index_block_cache_miss);
|
|
|
|
ASSERT_EQ(expected_index_block_cache_hit, index_block_cache_hit);
|
|
|
|
}
|
|
|
|
|
|
|
|
void AssertFilterBlockStat(int64_t expected_filter_block_cache_miss,
|
|
|
|
int64_t expected_filter_block_cache_hit) {
|
|
|
|
ASSERT_EQ(expected_filter_block_cache_miss, filter_block_cache_miss);
|
|
|
|
ASSERT_EQ(expected_filter_block_cache_hit, filter_block_cache_hit);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check if the fetched props matches the expected ones.
|
|
|
|
// TODO(kailiu) Use this only when you disabled filter policy!
|
|
|
|
void AssertEqual(int64_t expected_index_block_cache_miss,
|
|
|
|
int64_t expected_index_block_cache_hit,
|
|
|
|
int64_t expected_data_block_cache_miss,
|
|
|
|
int64_t expected_data_block_cache_hit) const {
|
|
|
|
ASSERT_EQ(expected_index_block_cache_miss, index_block_cache_miss);
|
|
|
|
ASSERT_EQ(expected_index_block_cache_hit, index_block_cache_hit);
|
|
|
|
ASSERT_EQ(expected_data_block_cache_miss, data_block_cache_miss);
|
|
|
|
ASSERT_EQ(expected_data_block_cache_hit, data_block_cache_hit);
|
|
|
|
ASSERT_EQ(expected_index_block_cache_miss + expected_data_block_cache_miss,
|
|
|
|
block_cache_miss);
|
|
|
|
ASSERT_EQ(expected_index_block_cache_hit + expected_data_block_cache_hit,
|
|
|
|
block_cache_hit);
|
|
|
|
}
|
|
|
|
|
|
|
|
int64_t GetCacheBytesRead() { return block_cache_bytes_read; }
|
|
|
|
|
|
|
|
int64_t GetCacheBytesWrite() { return block_cache_bytes_write; }
|
|
|
|
|
|
|
|
private:
|
|
|
|
int64_t block_cache_miss = 0;
|
|
|
|
int64_t block_cache_hit = 0;
|
|
|
|
int64_t index_block_cache_miss = 0;
|
|
|
|
int64_t index_block_cache_hit = 0;
|
|
|
|
int64_t data_block_cache_miss = 0;
|
|
|
|
int64_t data_block_cache_hit = 0;
|
|
|
|
int64_t filter_block_cache_miss = 0;
|
|
|
|
int64_t filter_block_cache_hit = 0;
|
|
|
|
int64_t block_cache_bytes_read = 0;
|
|
|
|
int64_t block_cache_bytes_write = 0;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Make sure, by default, index/filter blocks were pre-loaded (meaning we won't
|
|
|
|
// use block cache to store them).
|
|
|
|
TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) {
|
|
|
|
Options options;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.statistics = CreateDBStatistics();
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
table_options.block_cache = NewLRUCache(1024, 4);
|
|
|
|
table_options.filter_policy.reset(NewBloomFilterPolicy(10));
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
|
|
|
|
TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
|
|
|
|
c.Add("key", "value");
|
|
|
|
const ImmutableOptions ioptions(options);
|
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
|
|
|
|
|
|
|
// preloading filter/index blocks is enabled.
|
|
|
|
auto reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
ASSERT_FALSE(reader->TEST_FilterBlockInCache());
|
Move the index readers out of the block cache (#5298)
Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning, or caching/pinning the
IndexReader based on the customer's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298
Differential Revision: D15303203
Pulled By: ltamasi
fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
6 years ago
|
|
|
ASSERT_FALSE(reader->TEST_IndexBlockInCache());
|
|
|
|
|
|
|
|
{
|
|
|
|
// nothing happens in the beginning
|
|
|
|
BlockCachePropertiesSnapshot props(options.statistics.get());
|
|
|
|
props.AssertIndexBlockStat(0, 0);
|
|
|
|
props.AssertFilterBlockStat(0, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
|
|
|
|
GetContext::kNotFound, Slice(), nullptr, nullptr,
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
5 years ago
|
|
|
nullptr, true, nullptr, nullptr);
|
|
|
|
// a hack that just to trigger BlockBasedTable::GetFilter.
|
|
|
|
ASSERT_OK(reader->Get(ReadOptions(), "non-exist-key", &get_context,
|
|
|
|
moptions.prefix_extractor.get()));
|
|
|
|
BlockCachePropertiesSnapshot props(options.statistics.get());
|
|
|
|
props.AssertIndexBlockStat(0, 0);
|
|
|
|
props.AssertFilterBlockStat(0, 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Due to the difficulities of the intersaction between statistics, this test
|
|
|
|
// only tests the case when "index block is put to block cache"
|
|
|
|
TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) {
|
|
|
|
// -- Table construction
|
|
|
|
Options options;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.statistics = CreateDBStatistics();
|
|
|
|
|
|
|
|
// Enable the cache for index/filter blocks
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
LRUCacheOptions co;
|
|
|
|
co.capacity = 2048;
|
|
|
|
co.num_shard_bits = 2;
|
|
|
|
co.metadata_charge_policy = kDontChargeCacheMetadata;
|
|
|
|
table_options.block_cache = NewLRUCache(co);
|
|
|
|
table_options.cache_index_and_filter_blocks = true;
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
|
|
|
|
TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
|
|
|
|
c.Add("key", "value");
|
|
|
|
const ImmutableOptions ioptions(options);
|
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
|
|
|
// preloading filter/index blocks is prohibited.
|
|
|
|
auto* reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
ASSERT_FALSE(reader->TEST_FilterBlockInCache());
|
Move the index readers out of the block cache (#5298)
Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning, or caching/pinning the
IndexReader based on the customer's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298
Differential Revision: D15303203
Pulled By: ltamasi
fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
6 years ago
|
|
|
ASSERT_TRUE(reader->TEST_IndexBlockInCache());
|
|
|
|
|
|
|
|
// -- PART 1: Open with regular block cache.
|
|
|
|
// Since block_cache is disabled, no cache activities will be involved.
|
|
|
|
std::unique_ptr<InternalIterator> iter;
|
|
|
|
|
|
|
|
int64_t last_cache_bytes_read = 0;
|
|
|
|
// At first, no block will be accessed.
|
|
|
|
{
|
|
|
|
BlockCachePropertiesSnapshot props(options.statistics.get());
|
|
|
|
// index will be added to block cache.
|
|
|
|
props.AssertEqual(1, // index block miss
|
|
|
|
0, 0, 0);
|
|
|
|
ASSERT_EQ(props.GetCacheBytesRead(), 0);
|
|
|
|
ASSERT_EQ(props.GetCacheBytesWrite(),
|
|
|
|
static_cast<int64_t>(table_options.block_cache->GetUsage()));
|
|
|
|
last_cache_bytes_read = props.GetCacheBytesRead();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Only index block will be accessed
|
|
|
|
{
|
|
|
|
iter.reset(c.NewIterator(moptions.prefix_extractor.get()));
|
|
|
|
BlockCachePropertiesSnapshot props(options.statistics.get());
|
|
|
|
// NOTE: to help better highlight the "detla" of each ticker, I use
|
|
|
|
// <last_value> + <added_value> to indicate the increment of changed
|
|
|
|
// value; other numbers remain the same.
|
|
|
|
props.AssertEqual(1, 0 + 1, // index block hit
|
|
|
|
0, 0);
|
|
|
|
// Cache hit, bytes read from cache should increase
|
|
|
|
ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read);
|
|
|
|
ASSERT_EQ(props.GetCacheBytesWrite(),
|
|
|
|
static_cast<int64_t>(table_options.block_cache->GetUsage()));
|
|
|
|
last_cache_bytes_read = props.GetCacheBytesRead();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Only data block will be accessed
|
|
|
|
{
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
BlockCachePropertiesSnapshot props(options.statistics.get());
|
|
|
|
props.AssertEqual(1, 1, 0 + 1, // data block miss
|
|
|
|
0);
|
|
|
|
// Cache miss, Bytes read from cache should not change
|
|
|
|
ASSERT_EQ(props.GetCacheBytesRead(), last_cache_bytes_read);
|
|
|
|
ASSERT_EQ(props.GetCacheBytesWrite(),
|
|
|
|
static_cast<int64_t>(table_options.block_cache->GetUsage()));
|
|
|
|
last_cache_bytes_read = props.GetCacheBytesRead();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Data block will be in cache
|
|
|
|
{
|
|
|
|
iter.reset(c.NewIterator(moptions.prefix_extractor.get()));
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
BlockCachePropertiesSnapshot props(options.statistics.get());
|
|
|
|
props.AssertEqual(1, 1 + 1, /* index block hit */
|
|
|
|
1, 0 + 1 /* data block hit */);
|
|
|
|
// Cache hit, bytes read from cache should increase
|
|
|
|
ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read);
|
|
|
|
ASSERT_EQ(props.GetCacheBytesWrite(),
|
|
|
|
static_cast<int64_t>(table_options.block_cache->GetUsage()));
|
|
|
|
}
|
|
|
|
// release the iterator so that the block cache can reset correctly.
|
|
|
|
iter.reset();
|
|
|
|
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
9 years ago
|
|
|
c.ResetTableReader();
|
|
|
|
|
|
|
|
// -- PART 2: Open with very small block cache
|
|
|
|
// In this test, no block will ever get hit since the block cache is
|
|
|
|
// too small to fit even one entry.
|
|
|
|
table_options.block_cache = NewLRUCache(1, 4);
|
|
|
|
options.statistics = CreateDBStatistics();
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
const ImmutableOptions ioptions2(options);
|
|
|
|
const MutableCFOptions moptions2(options);
|
|
|
|
ASSERT_OK(c.Reopen(ioptions2, moptions2));
|
|
|
|
{
|
|
|
|
BlockCachePropertiesSnapshot props(options.statistics.get());
|
|
|
|
props.AssertEqual(1, // index block miss
|
|
|
|
0, 0, 0);
|
|
|
|
// Cache miss, Bytes read from cache should not change
|
|
|
|
ASSERT_EQ(props.GetCacheBytesRead(), 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
// Both index and data block get accessed.
|
|
|
|
// It first cache index block then data block. But since the cache size
|
|
|
|
// is only 1, index block will be purged after data block is inserted.
|
|
|
|
iter.reset(c.NewIterator(moptions2.prefix_extractor.get()));
|
|
|
|
BlockCachePropertiesSnapshot props(options.statistics.get());
|
|
|
|
props.AssertEqual(1 + 1, // index block miss
|
|
|
|
0, 0, // data block miss
|
|
|
|
0);
|
|
|
|
// Cache hit, bytes read from cache should increase
|
|
|
|
ASSERT_EQ(props.GetCacheBytesRead(), 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
// SeekToFirst() accesses data block. With similar reason, we expect data
|
|
|
|
// block's cache miss.
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
BlockCachePropertiesSnapshot props(options.statistics.get());
|
|
|
|
props.AssertEqual(2, 0, 0 + 1, // data block miss
|
|
|
|
0);
|
|
|
|
// Cache miss, Bytes read from cache should not change
|
|
|
|
ASSERT_EQ(props.GetCacheBytesRead(), 0);
|
|
|
|
}
|
|
|
|
iter.reset();
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
9 years ago
|
|
|
c.ResetTableReader();
|
|
|
|
|
|
|
|
// -- PART 3: Open table with bloom filter enabled but not in SST file
|
|
|
|
table_options.block_cache = NewLRUCache(4096, 4);
|
|
|
|
table_options.cache_index_and_filter_blocks = false;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
TableConstructor c3(BytewiseComparator());
|
|
|
|
std::string user_key = "k01";
|
|
|
|
InternalKey internal_key(user_key, 0, kTypeValue);
|
|
|
|
c3.Add(internal_key.Encode().ToString(), "hello");
|
|
|
|
ImmutableOptions ioptions3(options);
|
|
|
|
MutableCFOptions moptions3(options);
|
|
|
|
// Generate table without filter policy
|
|
|
|
c3.Finish(options, ioptions3, moptions3, table_options,
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
9 years ago
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
|
|
|
c3.ResetTableReader();
|
|
|
|
|
|
|
|
// Open table with filter policy
|
|
|
|
table_options.filter_policy.reset(NewBloomFilterPolicy(1));
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
options.statistics = CreateDBStatistics();
|
|
|
|
ImmutableOptions ioptions4(options);
|
|
|
|
MutableCFOptions moptions4(options);
|
|
|
|
ASSERT_OK(c3.Reopen(ioptions4, moptions4));
|
|
|
|
reader = dynamic_cast<BlockBasedTable*>(c3.GetTableReader());
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
5 years ago
|
|
|
ASSERT_FALSE(reader->TEST_FilterBlockInCache());
|
|
|
|
PinnableSlice value;
|
|
|
|
GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
|
|
|
|
GetContext::kNotFound, user_key, &value, nullptr,
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
5 years ago
|
|
|
nullptr, true, nullptr, nullptr);
|
|
|
|
ASSERT_OK(reader->Get(ReadOptions(), internal_key.Encode(), &get_context,
|
|
|
|
moptions4.prefix_extractor.get()));
|
|
|
|
ASSERT_STREQ(value.data(), "hello");
|
|
|
|
BlockCachePropertiesSnapshot props(options.statistics.get());
|
|
|
|
props.AssertFilterBlockStat(0, 0);
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
9 years ago
|
|
|
c3.ResetTableReader();
|
|
|
|
}
|
|
|
|
|
|
|
|
void ValidateBlockSizeDeviation(int value, int expected) {
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.block_size_deviation = value;
|
|
|
|
BlockBasedTableFactory* factory = new BlockBasedTableFactory(table_options);
|
|
|
|
|
|
|
|
const BlockBasedTableOptions* normalized_table_options =
|
|
|
|
factory->GetOptions<BlockBasedTableOptions>();
|
|
|
|
ASSERT_EQ(normalized_table_options->block_size_deviation, expected);
|
|
|
|
|
|
|
|
delete factory;
|
|
|
|
}
|
|
|
|
|
|
|
|
void ValidateBlockRestartInterval(int value, int expected) {
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.block_restart_interval = value;
|
|
|
|
BlockBasedTableFactory* factory = new BlockBasedTableFactory(table_options);
|
|
|
|
|
|
|
|
const BlockBasedTableOptions* normalized_table_options =
|
|
|
|
factory->GetOptions<BlockBasedTableOptions>();
|
|
|
|
ASSERT_EQ(normalized_table_options->block_restart_interval, expected);
|
|
|
|
|
|
|
|
delete factory;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Checks how out-of-range table option values are normalized by the factory.
TEST_P(BlockBasedTableTest, InvalidOptions) {
  // One configured value paired with the value expected after normalization.
  struct OptionCase {
    int configured;
    int normalized;
  };

  // invalid values for block_size_deviation (<0 or >100) are silently set to 0
  const OptionCase deviation_cases[] = {
      {-10, 0}, {-1, 0},    {0, 0},   {1, 1},
      {99, 99}, {100, 100}, {101, 0}, {1000, 0},
  };
  for (const OptionCase& tc : deviation_cases) {
    ValidateBlockSizeDeviation(tc.configured, tc.normalized);
  }

  // invalid values for block_restart_interval (<1) are silently set to 1
  const OptionCase restart_interval_cases[] = {
      {-10, 1}, {-1, 1}, {0, 1}, {1, 1}, {2, 2}, {1000, 1000},
  };
  for (const OptionCase& tc : restart_interval_cases) {
    ValidateBlockRestartInterval(tc.configured, tc.normalized);
  }
}
|
|
|
|
|
|
|
|
TEST_P(BlockBasedTableTest, BlockReadCountTest) {
|
|
|
|
// bloom_filter_type = 0 -- block-based filter
|
|
|
|
// bloom_filter_type = 1 -- full filter
|
|
|
|
for (int bloom_filter_type = 0; bloom_filter_type < 2; ++bloom_filter_type) {
|
|
|
|
for (int index_and_filter_in_cache = 0; index_and_filter_in_cache < 2;
|
|
|
|
++index_and_filter_in_cache) {
|
|
|
|
Options options;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
table_options.block_cache = NewLRUCache(1, 0);
|
|
|
|
table_options.cache_index_and_filter_blocks = index_and_filter_in_cache;
|
|
|
|
table_options.filter_policy.reset(
|
|
|
|
NewBloomFilterPolicy(10, bloom_filter_type == 0));
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
|
|
|
|
TableConstructor c(BytewiseComparator());
|
|
|
|
std::string user_key = "k04";
|
|
|
|
InternalKey internal_key(user_key, 0, kTypeValue);
|
|
|
|
std::string encoded_key = internal_key.Encode().ToString();
|
|
|
|
c.Add(encoded_key, "hello");
|
|
|
|
ImmutableOptions ioptions(options);
|
|
|
|
MutableCFOptions moptions(options);
|
|
|
|
// Generate table with filter policy
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
|
|
|
auto reader = c.GetTableReader();
|
|
|
|
PinnableSlice value;
|
|
|
|
{
|
|
|
|
GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
|
|
|
|
GetContext::kNotFound, user_key, &value, nullptr,
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
5 years ago
|
|
|
nullptr, true, nullptr, nullptr);
|
|
|
|
get_perf_context()->Reset();
|
|
|
|
ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context,
|
|
|
|
moptions.prefix_extractor.get()));
|
|
|
|
if (index_and_filter_in_cache) {
|
|
|
|
// data, index and filter block
|
|
|
|
ASSERT_EQ(get_perf_context()->block_read_count, 3);
|
|
|
|
ASSERT_EQ(get_perf_context()->index_block_read_count, 1);
|
|
|
|
ASSERT_EQ(get_perf_context()->filter_block_read_count, 1);
|
|
|
|
} else {
|
|
|
|
// just the data block
|
|
|
|
ASSERT_EQ(get_perf_context()->block_read_count, 1);
|
|
|
|
}
|
|
|
|
ASSERT_EQ(get_context.State(), GetContext::kFound);
|
|
|
|
ASSERT_STREQ(value.data(), "hello");
|
|
|
|
}
|
|
|
|
|
|
|
|
// Get non-existing key
|
|
|
|
user_key = "does-not-exist";
|
|
|
|
internal_key = InternalKey(user_key, 0, kTypeValue);
|
|
|
|
encoded_key = internal_key.Encode().ToString();
|
|
|
|
|
|
|
|
value.Reset();
|
|
|
|
{
|
|
|
|
GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
|
|
|
|
GetContext::kNotFound, user_key, &value, nullptr,
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
5 years ago
|
|
|
nullptr, true, nullptr, nullptr);
|
|
|
|
get_perf_context()->Reset();
|
|
|
|
ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context,
|
|
|
|
moptions.prefix_extractor.get()));
|
|
|
|
ASSERT_EQ(get_context.State(), GetContext::kNotFound);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (index_and_filter_in_cache) {
|
|
|
|
if (bloom_filter_type == 0) {
|
|
|
|
// with block-based, we read index and then the filter
|
|
|
|
ASSERT_EQ(get_perf_context()->block_read_count, 2);
|
|
|
|
ASSERT_EQ(get_perf_context()->index_block_read_count, 1);
|
|
|
|
ASSERT_EQ(get_perf_context()->filter_block_read_count, 1);
|
|
|
|
} else {
|
|
|
|
// with full-filter, we read filter first and then we stop
|
|
|
|
ASSERT_EQ(get_perf_context()->block_read_count, 1);
|
|
|
|
ASSERT_EQ(get_perf_context()->filter_block_read_count, 1);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// filter is already in memory and it figures out that the key doesn't
|
|
|
|
// exist
|
|
|
|
ASSERT_EQ(get_perf_context()->block_read_count, 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(BlockBasedTableTest, BlockCacheLeak) {
|
|
|
|
// Check that when we reopen a table we don't lose access to blocks already
|
|
|
|
// in the cache. This test checks whether the Table actually makes use of the
|
|
|
|
// unique ID from the file.
|
|
|
|
|
|
|
|
Options opt;
|
|
|
|
std::unique_ptr<InternalKeyComparator> ikc;
|
|
|
|
ikc.reset(new test::PlainInternalKeyComparator(opt.comparator));
|
|
|
|
opt.compression = kNoCompression;
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
table_options.block_size = 1024;
|
|
|
|
// big enough so we don't ever lose cached values.
|
|
|
|
table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4);
|
|
|
|
opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
|
|
|
|
c.Add("k01", "hello");
|
|
|
|
c.Add("k02", "hello2");
|
|
|
|
c.Add("k03", std::string(10000, 'x'));
|
|
|
|
c.Add("k04", std::string(200000, 'x'));
|
|
|
|
c.Add("k05", std::string(300000, 'x'));
|
|
|
|
c.Add("k06", "hello3");
|
|
|
|
c.Add("k07", std::string(100000, 'x'));
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
const ImmutableOptions ioptions(opt);
|
|
|
|
const MutableCFOptions moptions(opt);
|
|
|
|
c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap);
|
|
|
|
|
|
|
|
std::unique_ptr<InternalIterator> iter(
|
|
|
|
c.NewIterator(moptions.prefix_extractor.get()));
|
|
|
|
iter->SeekToFirst();
|
|
|
|
while (iter->Valid()) {
|
|
|
|
iter->key();
|
|
|
|
iter->value();
|
|
|
|
iter->Next();
|
|
|
|
}
|
|
|
|
ASSERT_OK(iter->status());
|
Change and clarify the relationship between Valid(), status() and Seek*() for all iterators. Also fix some bugs
Summary:
Before this PR, Iterator/InternalIterator may simultaneously have non-ok status() and Valid() = true. That state means that the last operation failed, but the iterator is nevertheless positioned on some unspecified record. Likely intended uses of that are:
* If some sst files are corrupted, a normal iterator can be used to read the data from files that are not corrupted.
* When using read_tier = kBlockCacheTier, read the data that's in block cache, skipping over the data that is not.
However, this behavior wasn't documented well (and until recently the wiki on github had misleading incorrect information). In the code there's a lot of confusion about the relationship between status() and Valid(), and about whether Seek()/SeekToLast()/etc reset the status or not. There were a number of bugs caused by this confusion, both inside rocksdb and in the code that uses rocksdb (including ours).
This PR changes the convention to:
* If status() is not ok, Valid() always returns false.
* Any seek operation resets status. (Before the PR, it depended on iterator type and on particular error.)
This does sacrifice the two use cases listed above, but siying said it's ok.
Overview of the changes:
* A commit that adds missing status checks in MergingIterator. This fixes a bug that actually affects us, and we need it fixed. `DBIteratorTest.NonBlockingIterationBugRepro` explains the scenario.
* Changes to lots of iterator types to make all of them conform to the new convention. Some bug fixes along the way. By far the biggest changes are in DBIter, which is a big messy piece of code; I tried to make it less big and messy but mostly failed.
* A stress-test for DBIter, to gain some confidence that I didn't break it. It does a few million random operations on the iterator, while occasionally modifying the underlying data (like ForwardIterator does) and occasionally returning non-ok status from internal iterator.
To find the iterator types that needed changes I searched for "public .*Iterator" in the code. Here's an overview of all 27 iterator types:
Iterators that didn't need changes:
* status() is always ok(), or Valid() is always false: MemTableIterator, ModelIter, TestIterator, KVIter (2 classes with this name anonymous namespaces), LoggingForwardVectorIterator, VectorIterator, MockTableIterator, EmptyIterator, EmptyInternalIterator.
* Thin wrappers that always pass through Valid() and status(): ArenaWrappedDBIter, TtlIterator, InternalIteratorFromIterator.
Iterators with changes (see inline comments for details):
* DBIter - an overhaul:
- It used to silently skip corrupted keys (`FindParseableKey()`), which seems dangerous. This PR makes it just stop immediately after encountering a corrupted key, just like it would for other kinds of corruption. Let me know if there was actually some deeper meaning in this behavior and I should put it back.
- It had a few code paths silently discarding subiterator's status. The stress test caught a few.
- The backwards iteration code path was expecting the internal iterator's set of keys to be immutable. It's probably always true in practice at the moment, since ForwardIterator doesn't support backwards iteration, but this PR fixes it anyway. See added DBIteratorTest.ReverseToForwardBug for an example.
- Some parts of backwards iteration code path even did things like `assert(iter_->Valid())` after a seek, which is never a safe assumption.
- It used to not reset status on seek for some types of errors.
- Some simplifications and better comments.
- Some things got more complicated from the added error handling. I'm open to ideas for how to make it nicer.
* MergingIterator - check status after every operation on every subiterator, and in some places assert that valid subiterators have ok status.
* ForwardIterator - changed to the new convention, also slightly simplified.
* ForwardLevelIterator - fixed some bugs and simplified.
* LevelIterator - simplified.
* TwoLevelIterator - changed to the new convention. Also fixed a bug that would make SeekForPrev() sometimes silently ignore errors from first_level_iter_.
* BlockBasedTableIterator - minor changes.
* BlockIter - replaced `SetStatus()` with `Invalidate()` to make sure non-ok BlockIter is always invalid.
* PlainTableIterator - some seeks used to not reset status.
* CuckooTableIterator - tiny code cleanup.
* ManagedIterator - fixed some bugs.
* BaseDeltaIterator - changed to the new convention and fixed a bug.
* BlobDBIterator - seeks used to not reset status.
* KeyConvertingIterator - some small change.
Closes https://github.com/facebook/rocksdb/pull/3810
Differential Revision: D7888019
Pulled By: al13n321
fbshipit-source-id: 4aaf6d3421c545d16722a815b2fa2e7912bc851d
7 years ago
|
|
|
iter.reset();
|
|
|
|
|
|
|
|
const ImmutableOptions ioptions1(opt);
|
|
|
|
const MutableCFOptions moptions1(opt);
|
|
|
|
ASSERT_OK(c.Reopen(ioptions1, moptions1));
|
|
|
|
auto table_reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
|
|
|
|
for (const std::string& key : keys) {
|
|
|
|
InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
|
|
|
|
ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode()));
|
|
|
|
}
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
9 years ago
|
|
|
c.ResetTableReader();
|
|
|
|
|
|
|
|
// rerun with different block cache
|
|
|
|
table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4);
|
|
|
|
opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
const ImmutableOptions ioptions2(opt);
|
|
|
|
const MutableCFOptions moptions2(opt);
|
|
|
|
ASSERT_OK(c.Reopen(ioptions2, moptions2));
|
|
|
|
table_reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
|
|
|
|
for (const std::string& key : keys) {
|
|
|
|
InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
|
|
|
|
ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode()));
|
|
|
|
}
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
9 years ago
|
|
|
c.ResetTableReader();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Builds and scans a table whose block cache uses a counting memory allocator,
// then checks that the allocator saw at least one allocation and that every
// allocation was matched by a deallocation once the cache went out of scope.
TEST_P(BlockBasedTableTest, MemoryAllocator) {
  auto default_memory_allocator = std::make_shared<DefaultMemoryAllocator>();
  auto custom_memory_allocator =
      std::make_shared<CountedMemoryAllocator>(default_memory_allocator);
  {
    Options opt;
    std::unique_ptr<InternalKeyComparator> ikc(
        new test::PlainInternalKeyComparator(opt.comparator));
    opt.compression = kNoCompression;

    BlockBasedTableOptions table_options;
    table_options.block_size = 1024;

    // LRU block cache that routes its allocations through the counting
    // allocator.
    LRUCacheOptions lruOptions;
    lruOptions.capacity = 16 * 1024 * 1024;
    lruOptions.num_shard_bits = 4;
    lruOptions.memory_allocator = custom_memory_allocator;
    table_options.block_cache = NewLRUCache(std::move(lruOptions));
    opt.table_factory.reset(NewBlockBasedTableFactory(table_options));

    TableConstructor c(BytewiseComparator(),
                       true /* convert_to_internal_key_ */);
    c.Add("k01", "hello");
    c.Add("k02", "hello2");
    c.Add("k03", std::string(10000, 'x'));
    c.Add("k04", std::string(200000, 'x'));
    c.Add("k05", std::string(300000, 'x'));
    c.Add("k06", "hello3");
    c.Add("k07", std::string(100000, 'x'));

    std::vector<std::string> keys;
    stl_wrappers::KVMap kvmap;
    const ImmutableOptions ioptions(opt);
    const MutableCFOptions moptions(opt);
    c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap);

    // Touch every entry of the table; the results themselves are discarded.
    std::unique_ptr<InternalIterator> iter(
        c.NewIterator(moptions.prefix_extractor.get()));
    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
      iter->key();
      iter->value();
    }
    ASSERT_OK(iter->status());
  }

  // out of scope, block cache should have been deleted, all allocations
  // deallocated
  EXPECT_EQ(custom_memory_allocator->GetNumAllocations(),
            custom_memory_allocator->GetNumDeallocations());
  // make sure that allocations actually happened through the cache allocator
  EXPECT_GT(custom_memory_allocator->GetNumAllocations(), 0);
}
|
|
|
|
|
|
|
|
// Test the file checksum of block based table
|
|
|
|
TEST_P(BlockBasedTableTest, NoFileChecksum) {
|
|
|
|
Options options;
|
|
|
|
ImmutableOptions ioptions(options);
|
|
|
|
MutableCFOptions moptions(options);
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
std::unique_ptr<InternalKeyComparator> comparator(
|
|
|
|
new InternalKeyComparator(BytewiseComparator()));
|
|
|
|
int level = 0;
|
|
|
|
IntTblPropCollectorFactories int_tbl_prop_collector_factories;
|
|
|
|
std::string column_family_name;
|
|
|
|
|
|
|
|
FileChecksumTestHelper f(true);
|
|
|
|
f.CreateWriteableFile();
|
|
|
|
std::unique_ptr<TableBuilder> builder;
|
|
|
|
builder.reset(ioptions.table_factory->NewTableBuilder(
|
|
|
|
TableBuilderOptions(ioptions, moptions, *comparator,
|
|
|
|
&int_tbl_prop_collector_factories,
|
|
|
|
options.compression, options.compression_opts,
|
Add more LSM info to FilterBuildingContext (#8246)
Summary:
Add `num_levels`, `is_bottommost`, and table file creation
`reason` to `FilterBuildingContext`, in anticipation of more powerful
Bloom-like filter support.
To support this, added `is_bottommost` and `reason` to
`TableBuilderOptions`, which allowed removing `reason` parameter from
`rocksdb::BuildTable`.
I attempted to remove `skip_filters` from `TableBuilderOptions`, because
filter construction decisions should arise from options, not one-off
parameters. I could not completely remove it because the public API for
SstFileWriter takes a `skip_filters` parameter, and translating this
into an option change would mean awkwardly replacing the table_factory
if it is BlockBasedTableFactory with new filter_policy=nullptr option.
I marked this public skip_filters option as deprecated because of this
oddity. (skip_filters on the read side probably makes sense.)
At least `skip_filters` is now largely hidden for users of
`TableBuilderOptions` and is no longer used for implementing the
optimize_filters_for_hits option. Bringing the logic for that option
closer to handling of FilterBuildingContext makes it more obvious that
hese two are using the same notion of "bottommost." (Planned:
configuration options for Bloom-like filters that generalize
`optimize_filters_for_hits`)
Recommended follow-up: Try to get away from "bottommost level" naming of
things, which is inaccurate (see
VersionStorageInfo::RangeMightExistAfterSortedRun), and move to
"bottommost run" or just "bottommost."
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8246
Test Plan:
extended an existing unit test to exercise and check various
filter building contexts. Also, existing tests for
optimize_filters_for_hits validate some of the "bottommost" handling,
which is now closely connected to FilterBuildingContext::is_bottommost
through TableBuilderOptions::is_bottommost
Reviewed By: mrambacher
Differential Revision: D28099346
Pulled By: pdillinger
fbshipit-source-id: 2c1072e29c24d4ac404c761a7b7663292372600a
4 years ago
|
|
|
kUnknownColumnFamily, column_family_name, level),
|
|
|
|
f.GetFileWriter()));
|
|
|
|
ASSERT_OK(f.ResetTableBuilder(std::move(builder)));
|
|
|
|
f.AddKVtoKVMap(1000);
|
|
|
|
ASSERT_OK(f.WriteKVAndFlushTable());
|
|
|
|
ASSERT_STREQ(f.GetFileChecksumFuncName(), kUnknownFileChecksumFuncName);
|
|
|
|
ASSERT_STREQ(f.GetFileChecksum().c_str(), kUnknownFileChecksum);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Writes a table through a file writer that has a CRC32c file checksum
// generator attached, then checks that (a) the reported checksum function name
// is "FileChecksumCrc32c", (b) the checksum recorded while writing equals one
// recomputed by an independent generator, and (c) the generator output for a
// fixed input has not changed.
TEST_P(BlockBasedTableTest, Crc32cFileChecksum) {
  FileChecksumGenCrc32cFactory* file_checksum_gen_factory =
      new FileChecksumGenCrc32cFactory();
  Options options;
  // options takes ownership of the factory.
  options.file_checksum_gen_factory.reset(file_checksum_gen_factory);
  ImmutableOptions ioptions(options);
  MutableCFOptions moptions(options);
  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
  std::unique_ptr<InternalKeyComparator> comparator(
      new InternalKeyComparator(BytewiseComparator()));
  int level = 0;
  IntTblPropCollectorFactories int_tbl_prop_collector_factories;
  std::string column_family_name;

  FileChecksumGenContext gen_context;
  gen_context.file_name = "db/tmp";
  std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen1 =
      options.file_checksum_gen_factory->CreateFileChecksumGenerator(
          gen_context);
  FileChecksumTestHelper f(true);
  f.CreateWriteableFile();
  // The generator is released from the unique_ptr and handed to the helper as
  // a raw pointer.
  f.SetFileChecksumGenerator(checksum_crc32c_gen1.release());
  std::unique_ptr<TableBuilder> builder;
  builder.reset(ioptions.table_factory->NewTableBuilder(
      TableBuilderOptions(ioptions, moptions, *comparator,
                          &int_tbl_prop_collector_factories,
                          options.compression, options.compression_opts,
                          kUnknownColumnFamily, column_family_name, level),
      f.GetFileWriter()));
  ASSERT_OK(f.ResetTableBuilder(std::move(builder)));
  f.AddKVtoKVMap(1000);
  ASSERT_OK(f.WriteKVAndFlushTable());
  ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c");

  std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen2 =
      options.file_checksum_gen_factory->CreateFileChecksumGenerator(
          gen_context);
  std::string checksum;
  ASSERT_OK(f.CalculateFileChecksum(checksum_crc32c_gen2.get(), &checksum));
  // The checksum is raw binary and may contain NUL bytes, so compare whole
  // strings (contents and length) rather than C strings, which would stop at
  // the first NUL.
  ASSERT_EQ(f.GetFileChecksum(), checksum);

  // Unit test the generator itself for schema stability
  std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen3 =
      options.file_checksum_gen_factory->CreateFileChecksumGenerator(
          gen_context);
  const char data[] = "here is some data";
  // sizeof(data) includes the terminating NUL; it is part of the hashed input.
  checksum_crc32c_gen3->Update(data, sizeof(data));
  checksum_crc32c_gen3->Finalize();
  checksum = checksum_crc32c_gen3->GetChecksum();
  // Explicit length so the comparison also pins the checksum to 4 bytes.
  ASSERT_EQ(checksum, std::string("\345\245\277\110", 4));
}
|
|
|
|
|
|
|
|
// Plain table is not supported in ROCKSDB_LITE
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
// Writes 26 fixed-size entries ('a'..'z') through a PlainTable builder into an
// in-memory sink, reads the table properties back from the produced bytes and
// checks the recorded sizes and counts.
TEST_F(PlainTableTest, BasicPlainTableProperties) {
  PlainTableOptions plain_table_options;
  plain_table_options.user_key_len = 8;
  plain_table_options.bloom_bits_per_key = 8;
  plain_table_options.hash_table_ratio = 0;

  PlainTableFactory factory(plain_table_options);
  std::unique_ptr<FSWritableFile> sink(new test::StringSink());
  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
      std::move(sink), "" /* don't care */, FileOptions()));
  Options options;
  const ImmutableOptions ioptions(options);
  const MutableCFOptions moptions(options);
  InternalKeyComparator ikc(options.comparator);
  IntTblPropCollectorFactories int_tbl_prop_collector_factories;
  std::string column_family_name;
  int unknown_level = -1;
  // The TableBuilderOptions temporary (and the CompressionOptions temporary it
  // is built from) must stay inside this full expression.
  std::unique_ptr<TableBuilder> builder(factory.NewTableBuilder(
      TableBuilderOptions(ioptions, moptions, ikc,
                          &int_tbl_prop_collector_factories, kNoCompression,
                          CompressionOptions(), kUnknownColumnFamily,
                          column_family_name, unknown_level),
      file_writer.get()));

  for (char c = 'a'; c <= 'z'; ++c) {
    std::string key(8, c);
    // PlainTable expects internal key structure: the 8-byte user key is
    // followed by an 8-byte suffix ('\1' plus 7 padding spaces). The suffix
    // is built explicitly so that every key is 16 bytes, which is what the
    // raw_key_size check below requires.
    key.push_back('\1');
    key.append(7, ' ');
    std::string value(28, c + 42);
    builder->Add(key, value);
  }
  ASSERT_OK(builder->Finish());
  ASSERT_OK(file_writer->Flush());

  // Re-open the bytes that were just written as a random-access source.
  test::StringSink* ss =
      static_cast<test::StringSink*>(file_writer->writable_file());
  std::unique_ptr<FSRandomAccessFile> source(
      new test::StringSource(ss->contents(), 72242, true));
  std::unique_ptr<RandomAccessFileReader> file_reader(
      new RandomAccessFileReader(std::move(source), "test"));

  std::unique_ptr<TableProperties> props;
  auto s = ReadTableProperties(file_reader.get(), ss->contents().size(),
                               kPlainTableMagicNumber, ioptions, &props);
  ASSERT_OK(s);

  ASSERT_EQ(0ul, props->index_size);
  ASSERT_EQ(0ul, props->filter_size);
  ASSERT_EQ(16ul * 26, props->raw_key_size);    // 26 keys of 16 bytes
  ASSERT_EQ(28ul * 26, props->raw_value_size);  // 26 values of 28 bytes
  ASSERT_EQ(26ul, props->num_entries);
  ASSERT_EQ(1ul, props->num_data_blocks);
}
|
|
|
|
|
|
|
|
// Builds a PlainTable file through FileChecksumTestHelper without ever
// installing a checksum generator on the helper, then expects the helper to
// report the "unknown" checksum function name and checksum value.
TEST_F(PlainTableTest, NoFileChecksum) {
  PlainTableOptions table_opts;
  table_opts.user_key_len = 20;
  table_opts.bloom_bits_per_key = 8;
  table_opts.hash_table_ratio = 0;
  PlainTableFactory plain_factory(table_opts);

  Options db_options;
  const ImmutableOptions immutable_opts(db_options);
  const MutableCFOptions mutable_opts(db_options);
  InternalKeyComparator internal_cmp(db_options.comparator);
  IntTblPropCollectorFactories collector_factories;
  std::string cf_name;
  int level_unknown = -1;
  FileChecksumTestHelper helper(true);
  helper.CreateWriteableFile();

  // Keep the TableBuilderOptions temporary inline: it is constructed from
  // another temporary (CompressionOptions()) that only lives for this call.
  std::unique_ptr<TableBuilder> table_builder(plain_factory.NewTableBuilder(
      TableBuilderOptions(immutable_opts, mutable_opts, internal_cmp,
                          &collector_factories, kNoCompression,
                          CompressionOptions(), kUnknownColumnFamily, cf_name,
                          level_unknown),
      helper.GetFileWriter()));
  ASSERT_OK(helper.ResetTableBuilder(std::move(table_builder)));

  // Populate 1000 entries and write them out.
  helper.AddKVtoKVMap(1000);
  ASSERT_OK(helper.WriteKVAndFlushTable());

  ASSERT_STREQ(helper.GetFileChecksumFuncName(), kUnknownFileChecksumFuncName);
  EXPECT_EQ(helper.GetFileChecksum(), kUnknownFileChecksum);
}
|
|
|
|
|
|
|
|
// Builds a PlainTable file with a CRC32c file checksum generator attached to
// the writer, then recomputes the checksum over the written file with a
// second generator from the same factory and expects both to agree.
TEST_F(PlainTableTest, Crc32cFileChecksum) {
  PlainTableOptions plain_table_options;
  plain_table_options.user_key_len = 20;
  plain_table_options.bloom_bits_per_key = 8;
  plain_table_options.hash_table_ratio = 0;
  PlainTableFactory factory(plain_table_options);

  FileChecksumGenCrc32cFactory* file_checksum_gen_factory =
      new FileChecksumGenCrc32cFactory();
  Options options;
  options.file_checksum_gen_factory.reset(file_checksum_gen_factory);
  const ImmutableOptions ioptions(options);
  const MutableCFOptions moptions(options);
  InternalKeyComparator ikc(options.comparator);
  IntTblPropCollectorFactories int_tbl_prop_collector_factories;
  std::string column_family_name;
  int unknown_level = -1;

  FileChecksumGenContext gen_context;
  gen_context.file_name = "db/tmp";
  std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen1 =
      options.file_checksum_gen_factory->CreateFileChecksumGenerator(
          gen_context);
  FileChecksumTestHelper f(true);
  f.CreateWriteableFile();
  // The generator is passed as a raw pointer; presumably the helper takes
  // ownership of it (NOTE(review): confirm against FileChecksumTestHelper).
  f.SetFileChecksumGenerator(checksum_crc32c_gen1.release());

  // The TableBuilderOptions temporary (and the CompressionOptions temporary it
  // is built from) must stay inside this full expression.
  std::unique_ptr<TableBuilder> builder(factory.NewTableBuilder(
      TableBuilderOptions(ioptions, moptions, ikc,
                          &int_tbl_prop_collector_factories, kNoCompression,
                          CompressionOptions(), kUnknownColumnFamily,
                          column_family_name, unknown_level),
      f.GetFileWriter()));
  ASSERT_OK(f.ResetTableBuilder(std::move(builder)));
  f.AddKVtoKVMap(1000);
  ASSERT_OK(f.WriteKVAndFlushTable());
  ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c");

  std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen2 =
      options.file_checksum_gen_factory->CreateFileChecksumGenerator(
          gen_context);
  std::string checksum;
  ASSERT_OK(f.CalculateFileChecksum(checksum_crc32c_gen2.get(), &checksum));
  // Compare the checksum strings by value rather than as C strings: a
  // checksum is raw bytes and may contain '\0', at which a C-string
  // comparison would stop and ignore any bytes that follow.
  EXPECT_EQ(f.GetFileChecksum(), checksum);
}
|
|
|
|
|
|
|
|
#endif // !ROCKSDB_LITE
|
|
|
|
|
|
|
|
// Builds an uncompressed block-based table with a mix of tiny and very large
// values and checks that ApproximateOffsetOf() lands in the expected byte
// range for keys before, at, between and after the stored keys.
TEST_F(GeneralTableTest, ApproximateOffsetOfPlain) {
  TableConstructor table(BytewiseComparator(),
                         true /* convert_to_internal_key_ */);
  table.Add("k01", "hello");
  table.Add("k02", "hello2");
  table.Add("k03", std::string(10000, 'x'));
  table.Add("k04", std::string(200000, 'x'));
  table.Add("k05", std::string(300000, 'x'));
  table.Add("k06", "hello3");
  table.Add("k07", std::string(100000, 'x'));

  std::vector<std::string> sorted_keys;
  stl_wrappers::KVMap kv_pairs;
  Options opts;
  opts.db_host_id = "";
  test::PlainInternalKeyComparator plain_ikc(opts.comparator);
  opts.compression = kNoCompression;
  BlockBasedTableOptions bbt_opts;
  bbt_opts.block_size = 1024;
  const ImmutableOptions immutable_opts(opts);
  const MutableCFOptions mutable_opts(opts);
  table.Finish(opts, immutable_opts, mutable_opts, bbt_opts, plain_ikc,
               &sorted_keys, &kv_pairs);

  // Everything up to and including k03 is expected at offset 0.
  ASSERT_TRUE(Between(table.ApproximateOffsetOf("abc"), 0, 0));
  ASSERT_TRUE(Between(table.ApproximateOffsetOf("k01"), 0, 0));
  ASSERT_TRUE(Between(table.ApproximateOffsetOf("k01a"), 0, 0));
  ASSERT_TRUE(Between(table.ApproximateOffsetOf("k02"), 0, 0));
  ASSERT_TRUE(Between(table.ApproximateOffsetOf("k03"), 0, 0));
  ASSERT_TRUE(Between(table.ApproximateOffsetOf("k04"), 10000, 11000));
  // k04 and k05 will be in two consecutive blocks, the index is
  // an arbitrary slice between k04 and k05, either before or after k04a
  ASSERT_TRUE(Between(table.ApproximateOffsetOf("k04a"), 10000, 211000));
  ASSERT_TRUE(Between(table.ApproximateOffsetOf("k05"), 210000, 211000));
  ASSERT_TRUE(Between(table.ApproximateOffsetOf("k06"), 510000, 511000));
  ASSERT_TRUE(Between(table.ApproximateOffsetOf("k07"), 510000, 511000));
  ASSERT_TRUE(Between(table.ApproximateOffsetOf("xyz"), 610000, 612000));
  table.ResetTableReader();
}
|
|
|
|
|
|
|
|
// Builds a block-based table compressed with `comp`, containing two short
// values and two 10000-byte compressible values, and checks that
// ApproximateOffsetOf() falls in the expected ranges for each probe key.
static void DoCompressionTest(CompressionType comp) {
  Random rnd(301);
  TableConstructor table(BytewiseComparator(),
                         true /* convert_to_internal_key_ */);
  std::string scratch;
  // The two CompressibleString() calls draw from the same generator, so
  // their relative order is kept as is.
  table.Add("k01", "hello");
  table.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &scratch));
  table.Add("k03", "hello3");
  table.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &scratch));

  std::vector<std::string> sorted_keys;
  stl_wrappers::KVMap kv_pairs;
  Options opts;
  test::PlainInternalKeyComparator plain_ikc(opts.comparator);
  opts.compression = comp;
  BlockBasedTableOptions bbt_opts;
  bbt_opts.block_size = 1024;
  const ImmutableOptions immutable_opts(opts);
  const MutableCFOptions mutable_opts(opts);
  table.Finish(opts, immutable_opts, mutable_opts, bbt_opts, plain_ikc,
               &sorted_keys, &kv_pairs);

  ASSERT_TRUE(Between(table.ApproximateOffsetOf("abc"), 0, 0));
  ASSERT_TRUE(Between(table.ApproximateOffsetOf("k01"), 0, 0));
  ASSERT_TRUE(Between(table.ApproximateOffsetOf("k02"), 0, 0));
  ASSERT_TRUE(Between(table.ApproximateOffsetOf("k03"), 2000, 3525));
  ASSERT_TRUE(Between(table.ApproximateOffsetOf("k04"), 2000, 3525));
  ASSERT_TRUE(Between(table.ApproximateOffsetOf("xyz"), 4000, 7050));
  table.ResetTableReader();
}
|
|
|
|
|
|
|
|
// Runs DoCompressionTest() once for every compression type that reports
// itself as supported; for each unsupported library a note is printed to
// stderr and its compression types are skipped.
TEST_F(GeneralTableTest, ApproximateOffsetOfCompressed) {
  std::vector<CompressionType> compression_state;
  if (!Snappy_Supported()) {
    fprintf(stderr, "skipping snappy compression tests\n");
  } else {
    compression_state.push_back(kSnappyCompression);
  }

  if (!Zlib_Supported()) {
    fprintf(stderr, "skipping zlib compression tests\n");
  } else {
    compression_state.push_back(kZlibCompression);
  }

  // TODO(kailiu) DoCompressionTest() doesn't work with BZip2.
  /*
  if (!BZip2_Supported()) {
    fprintf(stderr, "skipping bzip2 compression tests\n");
  } else {
    compression_state.push_back(kBZip2Compression);
  }
  */

  if (!LZ4_Supported()) {
    fprintf(stderr, "skipping lz4 and lz4hc compression tests\n");
  } else {
    compression_state.push_back(kLZ4Compression);
    compression_state.push_back(kLZ4HCCompression);
  }

  if (!XPRESS_Supported()) {
    // Only one xpress compression type is exercised here, so the message
    // names it once.
    fprintf(stderr, "skipping xpress compression tests\n");
  } else {
    compression_state.push_back(kXpressCompression);
  }

  for (auto state : compression_state) {
    DoCompressionTest(state);
  }
}
|
|
|
|
|
|
|
|
#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
|
|
|
// Runs the harness with an increasing number of random entries: every count
// from 0 through 49 in steps of one, then in steps of 200 while the count
// stays below 2000.
TEST_P(ParameterizedHarnessTest, RandomizedHarnessTest) {
  Random rnd(test::RandomSeed() + 5);
  int num_entries = 0;
  while (num_entries < 2000) {
    for (int added = 0; added < num_entries; added++) {
      // Kept as a single call expression so the evaluation order of the two
      // random draws is whatever it was before.
      Add(test::RandomKey(&rnd, rnd.Skewed(4)),
          rnd.RandomString(rnd.Skewed(5)));
    }
    Test(&rnd);

    // Fine-grained steps for small sizes, coarse steps afterwards.
    if (num_entries < 50) {
      num_entries += 1;
    } else {
      num_entries += 200;
    }
  }
}
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
// Adds 100000 random entries, runs the harness checks, and then verifies via
// the "rocksdb.num-files-at-level<N>" properties that at least one table file
// exists across all levels.
TEST_F(DBHarnessTest, RandomizedLongDB) {
  Random rnd(test::RandomSeed());
  int num_entries = 100000;
  for (int e = 0; e < num_entries; e++) {
    Add(test::RandomKey(&rnd, rnd.Skewed(4)), rnd.RandomString(rnd.Skewed(5)));
  }
  Test(&rnd);

  // We must have created enough data to force merging
  int files = 0;
  for (int level = 0; level < db()->NumberLevels(); level++) {
    std::string value;
    char name[100];
    snprintf(name, sizeof(name), "rocksdb.num-files-at-level%d", level);
    ASSERT_TRUE(db()->GetProperty(name, &value));
    // The property value is parsed as a decimal file count for this level.
    files += atoi(value.c_str());
  }
  ASSERT_GT(files, 0);
}
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
|
|
|
|
|
|
|
class MemTableTest : public testing::Test {
|
|
|
|
public:
|
|
|
|
MemTableTest() {
|
|
|
|
InternalKeyComparator cmp(BytewiseComparator());
|
|
|
|
auto table_factory = std::make_shared<SkipListFactory>();
|
|
|
|
options_.memtable_factory = table_factory;
|
|
|
|
ImmutableOptions ioptions(options_);
|
|
|
|
wb_ = new WriteBufferManager(options_.db_write_buffer_size);
|
|
|
|
memtable_ = new MemTable(cmp, ioptions, MutableCFOptions(options_), wb_,
|
|
|
|
kMaxSequenceNumber, 0 /* column_family_id */);
|
|
|
|
memtable_->Ref();
|
|
|
|
}
|
|
|
|
|
|
|
|
~MemTableTest() {
|
|
|
|
delete memtable_->Unref();
|
|
|
|
delete wb_;
|
|
|
|
}
|
|
|
|
|
|
|
|
MemTable* GetMemTable() { return memtable_; }
|
|
|
|
|
|
|
|
private:
|
|
|
|
MemTable* memtable_;
|
|
|
|
Options options_;
|
|
|
|
WriteBufferManager* wb_;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Inserts a small batch (four Puts and two DeleteRanges, starting at sequence
// number 100) into the fixture's memtable, then dumps the contents of both
// the point-key iterator and the range-tombstone iterator to stderr.
TEST_F(MemTableTest, Simple) {
  WriteBatch batch;
  WriteBatchInternal::SetSequence(&batch, 100);
  ASSERT_OK(batch.Put(std::string("k1"), std::string("v1")));
  ASSERT_OK(batch.Put(std::string("k2"), std::string("v2")));
  ASSERT_OK(batch.Put(std::string("k3"), std::string("v3")));
  ASSERT_OK(batch.Put(std::string("largekey"), std::string("vlarge")));
  ASSERT_OK(batch.DeleteRange(std::string("chi"), std::string("xigua")));
  ASSERT_OK(batch.DeleteRange(std::string("begin"), std::string("end")));
  ColumnFamilyMemTablesDefault cf_mems_default(GetMemTable());
  // ASSERT_OK (rather than ASSERT_TRUE(....ok())) so that a failure reports
  // the Status, consistent with the checks above.
  ASSERT_OK(WriteBatchInternal::InsertInto(&batch, &cf_mems_default, nullptr,
                                           nullptr));

  // Pass 0 walks the regular memtable iterator, pass 1 the range tombstones.
  for (int i = 0; i < 2; ++i) {
    // `arena` is declared before the guards so that it is destroyed after
    // them.
    Arena arena;
    ScopedArenaIterator arena_iter_guard;
    std::unique_ptr<InternalIterator> iter_guard;
    InternalIterator* iter = nullptr;
    if (i == 0) {
      iter = GetMemTable()->NewIterator(ReadOptions(), &arena);
      arena_iter_guard.set(iter);
    } else {
      iter = GetMemTable()->NewRangeTombstoneIterator(
          ReadOptions(), kMaxSequenceNumber /* read_seq */);
      iter_guard.reset(iter);
    }
    if (iter == nullptr) {
      continue;
    }
    iter->SeekToFirst();
    while (iter->Valid()) {
      fprintf(stderr, "key: '%s' -> '%s'\n", iter->key().ToString().c_str(),
              iter->value().ToString().c_str());
      iter->Next();
    }
  }
}
|
|
|
|
|
|
|
|
// Test the empty key
|
|
|
|
TEST_P(ParameterizedHarnessTest, SimpleEmptyKey) {
  // A single entry whose key is the empty string.
  Random rng(test::RandomSeed() + 1);
  Add("", "v");
  Test(&rng);
}
|
|
|
|
|
|
|
|
// Runs the harness checks over a table holding exactly one entry.
TEST_P(ParameterizedHarnessTest, SimpleSingle) {
  Random rng(test::RandomSeed() + 2);
  Add("abc", "v");
  Test(&rng);
}
|
|
|
|
|
|
|
|
// Runs the harness checks over three entries whose keys share prefixes
// ("abc", "abcd", "ac").
TEST_P(ParameterizedHarnessTest, SimpleMulti) {
  Random rng(test::RandomSeed() + 3);
  Add("abc", "v");
  Add("abcd", "v");
  Add("ac", "v2");
  Test(&rng);
}
|
|
|
|
|
|
|
|
// Runs the harness checks over a single entry whose key is two 0xff bytes.
TEST_P(ParameterizedHarnessTest, SimpleSpecialKey) {
  Random rng(test::RandomSeed() + 4);
  Add("\xff\xff", "v3");
  Test(&rng);
}
|
|
|
|
|
|
|
|
// Round-trips footers through FooterBuilder::Build and Footer::DecodeFrom and
// checks every decoded field. Covers the legacy block-based footer, every
// supported checksum type combined with every supported format_version >= 1,
// and (unless ROCKSDB_LITE is defined) two plain-table footers.
TEST(TableTest, FooterTests) {
  Random* rnd = Random::GetTLSInstance();
  // 5 == block trailer size
  const uint64_t kTrailer = 5;
  // Randomized sizes. The index handle starts one trailer after the data, the
  // metaindex handle one trailer after the index, and the footer one trailer
  // after the metaindex.
  uint64_t data_size = (uint64_t{1} << rnd->Uniform(40)) + rnd->Uniform(100);
  uint64_t index_size = rnd->Uniform(1000000000);
  uint64_t metaindex_size = rnd->Uniform(1000000);
  BlockHandle index(data_size + kTrailer, index_size);
  BlockHandle meta_index(data_size + index_size + 2 * kTrailer,
                         metaindex_size);
  uint64_t footer_offset =
      data_size + metaindex_size + index_size + 3 * kTrailer;
  {
    // legacy block based
    FooterBuilder footer;
    footer.Build(kBlockBasedTableMagicNumber, /* format_version */ 0,
                 footer_offset, kCRC32c, meta_index, index);
    Footer decoded_footer;
    ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset));
    ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber);
    ASSERT_EQ(decoded_footer.checksum_type(), kCRC32c);
    const auto& decoded_meta = decoded_footer.metaindex_handle();
    ASSERT_EQ(decoded_meta.offset(), meta_index.offset());
    ASSERT_EQ(decoded_meta.size(), meta_index.size());
    const auto& decoded_index = decoded_footer.index_handle();
    ASSERT_EQ(decoded_index.offset(), index.offset());
    ASSERT_EQ(decoded_index.size(), index.size());
    ASSERT_EQ(decoded_footer.format_version(), 0U);
    ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 5U);
    // Ensure serialized with legacy magic (read from the last 8 bytes)
    const auto& encoded = footer.GetSlice();
    ASSERT_EQ(DecodeFixed64(encoded.data() + encoded.size() - 8),
              kLegacyBlockBasedTableMagicNumber);
  }
  // block based, various checksums, various versions
  for (auto checksum : GetSupportedChecksums()) {
    for (uint32_t version = 1; IsSupportedFormatVersion(version); ++version) {
      FooterBuilder footer;
      footer.Build(kBlockBasedTableMagicNumber, version, footer_offset,
                   checksum, meta_index, index);
      Footer decoded_footer;
      ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset));
      ASSERT_EQ(decoded_footer.table_magic_number(),
                kBlockBasedTableMagicNumber);
      ASSERT_EQ(decoded_footer.checksum_type(), checksum);
      const auto& decoded_meta = decoded_footer.metaindex_handle();
      ASSERT_EQ(decoded_meta.offset(), meta_index.offset());
      ASSERT_EQ(decoded_meta.size(), meta_index.size());
      const auto& decoded_index = decoded_footer.index_handle();
      ASSERT_EQ(decoded_index.offset(), index.offset());
      ASSERT_EQ(decoded_index.size(), index.size());
      ASSERT_EQ(decoded_footer.format_version(), version);
      ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 5U);
    }
  }
// Plain table is not supported in ROCKSDB_LITE
#ifndef ROCKSDB_LITE
  {
    // legacy plain table
    FooterBuilder footer;
    footer.Build(kPlainTableMagicNumber, /* format_version */ 0, footer_offset,
                 kNoChecksum, meta_index);
    Footer decoded_footer;
    ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset));
    ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber);
    // Built with kNoChecksum, yet the decoded footer is expected to report
    // kCRC32c.
    ASSERT_EQ(decoded_footer.checksum_type(), kCRC32c);
    const auto& decoded_meta = decoded_footer.metaindex_handle();
    ASSERT_EQ(decoded_meta.offset(), meta_index.offset());
    ASSERT_EQ(decoded_meta.size(), meta_index.size());
    // No index handle was passed to Build; it decodes as (0, 0).
    const auto& decoded_index = decoded_footer.index_handle();
    ASSERT_EQ(decoded_index.offset(), 0U);
    ASSERT_EQ(decoded_index.size(), 0U);
    ASSERT_EQ(decoded_footer.format_version(), 0U);
    ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U);
    // Ensure serialized with legacy magic (read from the last 8 bytes)
    const auto& encoded = footer.GetSlice();
    ASSERT_EQ(DecodeFixed64(encoded.data() + encoded.size() - 8),
              kLegacyPlainTableMagicNumber);
  }
  {
    // xxhash plain table (not currently used)
    FooterBuilder footer;
    footer.Build(kPlainTableMagicNumber, /* format_version */ 1, footer_offset,
                 kxxHash, meta_index);
    Footer decoded_footer;
    ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset));
    ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber);
    ASSERT_EQ(decoded_footer.checksum_type(), kxxHash);
    const auto& decoded_meta = decoded_footer.metaindex_handle();
    ASSERT_EQ(decoded_meta.offset(), meta_index.offset());
    ASSERT_EQ(decoded_meta.size(), meta_index.size());
    const auto& decoded_index = decoded_footer.index_handle();
    ASSERT_EQ(decoded_index.offset(), 0U);
    ASSERT_EQ(decoded_index.size(), 0U);
    ASSERT_EQ(decoded_footer.format_version(), 1U);
    ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U);
  }
#endif  // !ROCKSDB_LITE
}
|
|
|
|
|
|
|
|
class IndexBlockRestartIntervalTest
|
|
|
|
: public TableTest,
|
|
|
|
public ::testing::WithParamInterface<std::pair<int, bool>> {
|
|
|
|
public:
|
|
|
|
static std::vector<std::pair<int, bool>> GetRestartValues() {
|
|
|
|
return {{-1, false}, {0, false}, {1, false}, {8, false},
|
|
|
|
{16, false}, {32, false}, {-1, true}, {0, true},
|
|
|
|
{1, true}, {8, true}, {16, true}, {32, true}};
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
INSTANTIATE_TEST_CASE_P(
|
|
|
|
IndexBlockRestartIntervalTest, IndexBlockRestartIntervalTest,
|
|
|
|
::testing::ValuesIn(IndexBlockRestartIntervalTest::GetRestartValues()));
|
|
|
|
|
|
|
|
// Builds a block-based table of 10000 random entries with a tiny block size
// and the parameterized index restart interval, then checks that both point
// seeks and a full forward scan return exactly the entries that were written.
TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) {
  const int kKeysInTable = 10000;
  const int kKeySize = 100;
  const int kValSize = 500;

  const auto& param = GetParam();
  const int index_block_restart_interval = param.first;
  const bool value_delta_encoding = param.second;

  Options options;
  BlockBasedTableOptions table_options;
  table_options.block_size = 64;  // small block size to get big index block
  table_options.index_block_restart_interval = index_block_restart_interval;
  // The test selects format_version 4 when value delta encoding is requested
  // and 3 otherwise.
  table_options.format_version = value_delta_encoding ? 4 : 3;
  options.table_factory.reset(new BlockBasedTableFactory(table_options));

  TableConstructor c(BytewiseComparator());
  // Function-local static: seeded once and shared by every invocation of this
  // test body.
  static Random rnd(301);
  for (int n = 0; n < kKeysInTable; ++n) {
    InternalKey ikey(rnd.RandomString(kKeySize), 0, kTypeValue);
    c.Add(ikey.Encode().ToString(), rnd.RandomString(kValSize));
  }

  std::vector<std::string> keys;
  stl_wrappers::KVMap kvmap;
  std::unique_ptr<InternalKeyComparator> comparator(
      new InternalKeyComparator(BytewiseComparator()));
  const ImmutableOptions ioptions(options);
  const MutableCFOptions moptions(options);
  c.Finish(options, ioptions, moptions, table_options, *comparator, &keys,
           &kvmap);
  auto reader = c.GetTableReader();

  ReadOptions read_options;
  std::unique_ptr<InternalIterator> db_iter(reader->NewIterator(
      read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr,
      /*skip_filters=*/false, TableReaderCaller::kUncategorized));

  // Test point lookup
  for (const auto& expected : kvmap) {
    db_iter->Seek(expected.first);

    ASSERT_TRUE(db_iter->Valid());
    ASSERT_OK(db_iter->status());
    ASSERT_EQ(db_iter->key(), expected.first);
    ASSERT_EQ(db_iter->value(), expected.second);
  }

  // Test iterating
  auto kv_iter = kvmap.begin();
  db_iter->SeekToFirst();
  while (db_iter->Valid()) {
    ASSERT_EQ(db_iter->key(), kv_iter->first);
    ASSERT_EQ(db_iter->value(), kv_iter->second);
    ++kv_iter;
    db_iter->Next();
  }
  // The scan must have consumed every expected entry.
  ASSERT_EQ(kv_iter, kvmap.end());
  c.ResetTableReader();
}
|
|
|
|
|
|
|
|
class PrefixTest : public testing::Test {
|
|
|
|
public:
|
|
|
|
PrefixTest() : testing::Test() {}
|
|
|
|
~PrefixTest() override {}
|
|
|
|
};
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
// A simple PrefixExtractor that only works for test PrefixAndWholeKeyTest
|
|
|
|
class TestPrefixExtractor : public ROCKSDB_NAMESPACE::SliceTransform {
|
|
|
|
public:
|
|
|
|
~TestPrefixExtractor() override{};
|
|
|
|
const char* Name() const override { return "TestPrefixExtractor"; }
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::Slice Transform(
|
|
|
|
const ROCKSDB_NAMESPACE::Slice& src) const override {
|
|
|
|
assert(IsValid(src));
|
|
|
|
return ROCKSDB_NAMESPACE::Slice(src.data(), 3);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool InDomain(const ROCKSDB_NAMESPACE::Slice& src) const override {
|
Fix a bug for SeekForPrev with partitioned filter and prefix (#8137)
Summary:
According to https://github.com/facebook/rocksdb/issues/5907, each filter partition "should include the bloom of the prefix of the last
key in the previous partition" so that SeekForPrev() in prefix mode can return correct result.
The prefix of the last key in the previous partition does not necessarily have the same prefix
as the first key in the current partition. Regardless of the first key in current partition, the
prefix of the last key in the previous partition should be added. The existing code, however,
does not follow this. Furthermore, there is another issue: when finishing current filter partition,
`FullFilterBlockBuilder::AddPrefix()` is called for the first key in next filter partition, which effectively
overwrites `last_prefix_str_` prematurely. Consequently, when the filter block builder proceeds
to the next partition, `last_prefix_str_` will be the prefix of its first key, leaving no way of adding
the bloom of the prefix of the last key of the previous partition.
Prefix extractor is FixedLength.2.
```
[ filter part 1 ] [ filter part 2 ]
abc d
```
When SeekForPrev("abcd"), checking the filter partition will land on filter part 2 because "abcd" > "abc"
but smaller than "d".
If the filter in filter part 2 happens to return false for the test for "ab", then SeekForPrev("abcd") will build
incorrect iterator tree in non-total-order mode.
Also fix a unit test which starts to fail following this PR. `InDomain` should not fail due to assertion
error when checking on an arbitrary key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8137
Test Plan:
```
make check
```
Without this fix, the following command will fail pretty soon.
```
./db_stress --acquire_snapshot_one_in=10000 --avoid_flush_during_recovery=0 \
--avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=0 \
--batch_protection_bytes_per_key=0 --block_size=16384 --bloom_bits=17 \
--bottommost_compression_type=disable --cache_index_and_filter_blocks=1 --cache_size=1048576 \
--checkpoint_one_in=0 --checksum_type=kxxHash64 --clear_column_family_one_in=0 \
--compact_files_one_in=1000000 --compact_range_one_in=1000000 --compaction_ttl=0 \
--compression_max_dict_buffer_bytes=0 --compression_max_dict_bytes=0 \
--compression_parallel_threads=1 --compression_type=zstd --compression_zstd_max_train_bytes=0 \
--continuous_verification_interval=0 --db=/dev/shm/rocksdb/rocksdb_crashtest_whitebox \
--db_write_buffer_size=8388608 --delpercent=5 --delrangepercent=0 --destroy_db_initially=0 --enable_blob_files=0 \
--enable_compaction_filter=0 --enable_pipelined_write=1 --file_checksum_impl=big --flush_one_in=1000000 \
--format_version=5 --get_current_wal_file_one_in=0 --get_live_files_one_in=1000000 --get_property_one_in=1000000 \
--get_sorted_wal_files_one_in=0 --index_block_restart_interval=4 --index_type=2 --ingest_external_file_one_in=0 \
--iterpercent=10 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=True \
--log2_keys_per_lock=10 --long_running_snapshots=1 --mark_for_compaction_one_file_in=0 \
--max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=100000000 --max_key_len=3 \
--max_manifest_file_size=1073741824 --max_write_batch_group_size_bytes=16777216 --max_write_buffer_number=3 \
--max_write_buffer_size_to_maintain=8388608 --memtablerep=skip_list --mmap_read=1 --mock_direct_io=False \
--nooverwritepercent=0 --open_files=500000 --ops_per_thread=20000000 --optimize_filters_for_memory=0 --paranoid_file_checks=1 --partition_filters=1 --partition_pinning=0 --pause_background_one_in=1000000 \
--periodic_compaction_seconds=0 --prefixpercent=5 --progress_reports=0 --read_fault_one_in=0 --read_only=0 \
--readpercent=45 --recycle_log_file_num=0 --reopen=20 --secondary_catch_up_one_in=0 \
--snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=104857600 \
--sst_file_manager_bytes_per_truncate=0 --subcompactions=2 --sync=0 --sync_fault_injection=False \
--target_file_size_base=2097152 --target_file_size_multiplier=2 --test_batches_snapshots=0 --test_cf_consistency=0 \
--top_level_index_pinning=0 --unpartitioned_pinning=1 --use_blob_db=0 --use_block_based_filter=0 \
--use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_merge=0 \
--use_multiget=0 --use_ribbon_filter=0 --use_txn=0 --user_timestamp_size=8 --verify_checksum=1 \
--verify_checksum_one_in=1000000 --verify_db_one_in=100000 --write_buffer_size=4194304 \
--write_dbid_to_manifest=1 --writepercent=35
```
Reviewed By: pdillinger
Differential Revision: D27553054
Pulled By: riversand963
fbshipit-source-id: 60e391e4a2d8d98a9a3172ec5d6176b90ec3de98
4 years ago
|
|
|
return IsValid(src);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool InRange(const ROCKSDB_NAMESPACE::Slice& /*dst*/) const override {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool IsValid(const ROCKSDB_NAMESPACE::Slice& src) const {
|
|
|
|
if (src.size() != 4) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (src[0] != '[') {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (src[1] < '0' || src[1] > '9') {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (src[2] != ']') {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (src[3] < '0' || src[3] > '9') {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
// Opens a fresh universal-compaction DB configured with TestPrefixExtractor
// and a bloom filter with whole_key_filtering enabled, writes the 100 keys
// "[i]j" (i, j in 0..9), and runs a full-range compaction. Every step is
// checked with ASSERT_OK.
TEST_F(PrefixTest, PrefixAndWholeKeyTest) {
  ROCKSDB_NAMESPACE::Options options;
  options.compaction_style = ROCKSDB_NAMESPACE::kCompactionStyleUniversal;
  options.num_levels = 20;
  options.create_if_missing = true;
  options.optimize_filters_for_hits = false;
  options.target_file_size_base = 268435456;
  options.prefix_extractor = std::make_shared<TestPrefixExtractor>();
  ROCKSDB_NAMESPACE::BlockBasedTableOptions bbto;
  bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
  bbto.block_size = 262144;
  bbto.whole_key_filtering = true;

  const std::string kDBPath = test::PerThreadDBPath("table_prefix_test");
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  // Start from a clean directory.
  ASSERT_OK(DestroyDB(kDBPath, options));
  ROCKSDB_NAMESPACE::DB* db = nullptr;
  ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
  // Own the handle so it is released even if an assertion below fails and
  // returns early from the test body.
  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db_guard(db);

  // Create a bunch of keys with 10 filters.
  for (int i = 0; i < 10; i++) {
    std::string prefix = "[" + std::to_string(i) + "]";
    for (int j = 0; j < 10; j++) {
      std::string key = prefix + std::to_string(j);
      ASSERT_OK(db->Put(ROCKSDB_NAMESPACE::WriteOptions(), key, "1"));
    }
  }

  // Trigger compaction.
  ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
  db_guard.reset();
  // NOTE(review): a comment here used to describe a second round with
  // whole_key_filtering turned off; no such round exists in this test.
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Disable TableWithGlobalSeqno since RocksDB does not store global_seqno in
|
|
|
|
* the SST file any more. Instead, RocksDB deduces global_seqno from the
|
|
|
|
* MANIFEST while reading from an SST. Therefore, it's not possible to test the
|
|
|
|
* functionality of global_seqno in a single, isolated unit test without the
|
|
|
|
* involvement of Version, VersionSet, etc.
|
|
|
|
*/
|
|
|
|
// Builds a block-based table whose user-collected properties carry an
// external-SST version (2) and a global_seqno (0), then overwrites the stored
// global_seqno in place (10, then 3) and checks after each rewrite that the
// properties, a full scan, and point seeks all report the new sequence number.
TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) {
  BlockBasedTableOptions bbto = GetBlockBasedTableOptions();
  // `sink` is handed to `holder` (and from there to the file writer); the raw
  // pointer is kept only so the written bytes can be read back below.
  test::StringSink* sink = new test::StringSink();
  std::unique_ptr<FSWritableFile> holder(sink);
  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
      std::move(holder), "" /* don't care */, FileOptions()));
  Options options;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  const ImmutableOptions ioptions(options);
  const MutableCFOptions moptions(options);
  InternalKeyComparator ikc(options.comparator);
  IntTblPropCollectorFactories int_tbl_prop_collector_factories;
  int_tbl_prop_collector_factories.emplace_back(
      new SstFileWriterPropertiesCollectorFactory(2 /* version */,
                                                  0 /* global_seqno*/));
  std::string column_family_name;
  std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
      TableBuilderOptions(ioptions, moptions, ikc,
                          &int_tbl_prop_collector_factories, kNoCompression,
                          CompressionOptions(), kUnknownColumnFamily,
                          column_family_name, -1),
      file_writer.get()));

  // One entry per letter: key == value == the letter repeated 8 times.
  for (char c = 'a'; c <= 'z'; ++c) {
    std::string key(8, c);
    std::string value = key;
    InternalKey ik(key, 0, kTypeValue);

    builder->Add(ik.Encode(), value);
  }
  ASSERT_OK(builder->Finish());
  ASSERT_OK(file_writer->Flush());

  test::RandomRWStringSink ss_rw(sink);
  // Zero-initialized so that a failed helper (which only returns from the
  // lambda) cannot lead to reads of indeterminate values afterwards.
  uint32_t version = 0;
  uint64_t global_seqno = 0;
  uint64_t global_seqno_offset = 0;

  // Helper function to get version, global_seqno, global_seqno_offset
  auto GetVersionAndGlobalSeqno = [&]() {
    std::unique_ptr<FSRandomAccessFile> source(
        new test::StringSource(ss_rw.contents(), 73342, true));
    std::unique_ptr<RandomAccessFileReader> file_reader(
        new RandomAccessFileReader(std::move(source), ""));

    std::unique_ptr<TableProperties> props;
    ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(),
                                  kBlockBasedTableMagicNumber, ioptions,
                                  &props));

    // Look the entries up with find() instead of operator[]: a missing key
    // must fail the test rather than decode from a default-inserted empty
    // string.
    const UserCollectedProperties& user_props =
        props->user_collected_properties;
    auto version_it = user_props.find(ExternalSstFilePropertyNames::kVersion);
    ASSERT_TRUE(version_it != user_props.end());
    ASSERT_GE(version_it->second.size(), sizeof(version));
    version = DecodeFixed32(version_it->second.c_str());

    auto seqno_it = user_props.find(ExternalSstFilePropertyNames::kGlobalSeqno);
    ASSERT_TRUE(seqno_it != user_props.end());
    ASSERT_GE(seqno_it->second.size(), sizeof(global_seqno));
    global_seqno = DecodeFixed64(seqno_it->second.c_str());

    global_seqno_offset = props->external_sst_file_global_seqno_offset;
  };

  // Helper function to update the value of the global seqno in the file
  auto SetGlobalSeqno = [&](uint64_t val) {
    std::string new_global_seqno;
    PutFixed64(&new_global_seqno, val);

    ASSERT_OK(ss_rw.Write(global_seqno_offset, new_global_seqno, IOOptions(),
                          nullptr));
  };

  // Helper function to (re)open the table and hand back an InternalIterator
  // over its contents. It returns through an out-parameter so that the
  // NewTableReader status can be checked with ASSERT_OK, which requires a
  // void-returning function. `*out` is left null on failure.
  std::unique_ptr<TableReader> table_reader;
  const ReadOptions read_options;
  auto GetTableInternalIter = [&](std::unique_ptr<InternalIterator>* out) {
    // Drop any previous iterator before its table reader is replaced.
    out->reset();

    std::unique_ptr<FSRandomAccessFile> source(
        new test::StringSource(ss_rw.contents(), 73342, true));
    std::unique_ptr<RandomAccessFileReader> file_reader(
        new RandomAccessFileReader(std::move(source), ""));

    ASSERT_OK(options.table_factory->NewTableReader(
        TableReaderOptions(ioptions, moptions.prefix_extractor.get(),
                           EnvOptions(), ikc),
        std::move(file_reader), ss_rw.contents().size(), &table_reader));

    out->reset(table_reader->NewIterator(
        read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr,
        /*skip_filters=*/false, TableReaderCaller::kUncategorized));
  };

  GetVersionAndGlobalSeqno();
  ASSERT_EQ(2u, version);
  ASSERT_EQ(0u, global_seqno);

  // Declared after `table_reader` so it is destroyed before the reader.
  std::unique_ptr<InternalIterator> iter;
  GetTableInternalIter(&iter);
  ASSERT_TRUE(iter != nullptr);
  char current_c = 'a';
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    ParsedInternalKey pik;
    ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */));

    ASSERT_EQ(pik.type, ValueType::kTypeValue);
    ASSERT_EQ(pik.sequence, 0);
    ASSERT_EQ(pik.user_key, iter->value());
    ASSERT_EQ(pik.user_key.ToString(), std::string(8, current_c));
    current_c++;
  }
  ASSERT_EQ(current_c, 'z' + 1);
  iter.reset();

  // Update global sequence number to 10
  SetGlobalSeqno(10);
  GetVersionAndGlobalSeqno();
  ASSERT_EQ(2u, version);
  ASSERT_EQ(10u, global_seqno);

  GetTableInternalIter(&iter);
  ASSERT_TRUE(iter != nullptr);
  current_c = 'a';
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    ParsedInternalKey pik;
    ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */));

    ASSERT_EQ(pik.type, ValueType::kTypeValue);
    ASSERT_EQ(pik.sequence, 10);
    ASSERT_EQ(pik.user_key, iter->value());
    ASSERT_EQ(pik.user_key.ToString(), std::string(8, current_c));
    current_c++;
  }
  ASSERT_EQ(current_c, 'z' + 1);

  // Verify Seek
  for (char c = 'a'; c <= 'z'; c++) {
    std::string k = std::string(8, c);
    InternalKey ik(k, 10, kValueTypeForSeek);
    iter->Seek(ik.Encode());
    ASSERT_TRUE(iter->Valid());

    ParsedInternalKey pik;
    ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */));

    ASSERT_EQ(pik.type, ValueType::kTypeValue);
    ASSERT_EQ(pik.sequence, 10);
    ASSERT_EQ(pik.user_key.ToString(), k);
    ASSERT_EQ(iter->value().ToString(), k);
  }
  iter.reset();

  // Update global sequence number to 3
  SetGlobalSeqno(3);
  GetVersionAndGlobalSeqno();
  ASSERT_EQ(2u, version);
  ASSERT_EQ(3u, global_seqno);

  GetTableInternalIter(&iter);
  ASSERT_TRUE(iter != nullptr);
  current_c = 'a';
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    ParsedInternalKey pik;
    ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */));

    ASSERT_EQ(pik.type, ValueType::kTypeValue);
    ASSERT_EQ(pik.sequence, 3);
    ASSERT_EQ(pik.user_key, iter->value());
    ASSERT_EQ(pik.user_key.ToString(), std::string(8, current_c));
    current_c++;
  }
  ASSERT_EQ(current_c, 'z' + 1);

  // Verify Seek
  for (char c = 'a'; c <= 'z'; c++) {
    std::string k = std::string(8, c);
    // Seek with seqno=4 although the stored entry has seqno=3. For equal user
    // keys, internal keys order by descending sequence number, so (k, 4)
    // sorts before (k, 3) and the seek should still land on our key.
    InternalKey ik(k, 4, kValueTypeForSeek);
    iter->Seek(ik.Encode());
    ASSERT_TRUE(iter->Valid());

    ParsedInternalKey pik;
    ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */));

    ASSERT_EQ(pik.type, ValueType::kTypeValue);
    ASSERT_EQ(pik.sequence, 3);
    ASSERT_EQ(pik.user_key.ToString(), k);
    ASSERT_EQ(iter->value().ToString(), k);
  }

  iter.reset();
}
|
|
|
|
|
|
|
|
// Writes 10000 small uncompressed entries with block_align enabled, checks
// through the table properties that data_size divides evenly into
// num_data_blocks blocks of 4096 bytes, and then reads every key back through
// a reader created from a factory that has block_align disabled.
TEST_P(BlockBasedTableTest, BlockAlignTest) {
  BlockBasedTableOptions bbto = GetBlockBasedTableOptions();
  bbto.block_align = true;
  // `sink` is handed to `holder` (and from there to the file writer); the raw
  // pointer is kept only so the written bytes can be read back below.
  test::StringSink* sink = new test::StringSink();
  std::unique_ptr<FSWritableFile> holder(sink);
  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
      std::move(holder), "" /* don't care */, FileOptions()));
  Options options;
  options.compression = kNoCompression;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  const ImmutableOptions ioptions(options);
  const MutableCFOptions moptions(options);
  InternalKeyComparator ikc(options.comparator);
  IntTblPropCollectorFactories int_tbl_prop_collector_factories;
  std::string column_family_name;
  std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
      TableBuilderOptions(ioptions, moptions, ikc,
                          &int_tbl_prop_collector_factories, kNoCompression,
                          CompressionOptions(), kUnknownColumnFamily,
                          column_family_name, -1),
      file_writer.get()));

  // Keys are the zero-padded, 5-digit decimal numbers 00001..10000.
  for (int i = 1; i <= 10000; ++i) {
    std::ostringstream ostr;
    ostr << std::setfill('0') << std::setw(5) << i;
    std::string key = ostr.str();
    std::string value = "val";
    InternalKey ik(key, 0, kTypeValue);

    builder->Add(ik.Encode(), value);
  }
  ASSERT_OK(builder->Finish());
  ASSERT_OK(file_writer->Flush());

  std::unique_ptr<FSRandomAccessFile> source(
      new test::StringSource(sink->contents(), 73342, false));
  std::unique_ptr<RandomAccessFileReader> file_reader(
      new RandomAccessFileReader(std::move(source), "test"));
  // Helper function to check, via the table properties, that the data blocks
  // were padded out to 4096 bytes each.
  auto VerifyBlockAlignment = [&]() {
    std::unique_ptr<TableProperties> props;
    ASSERT_OK(ReadTableProperties(file_reader.get(), sink->contents().size(),
                                  kBlockBasedTableMagicNumber, ioptions,
                                  &props));

    // Guard the division below.
    ASSERT_GT(props->num_data_blocks, 0u);
    uint64_t data_block_size = props->data_size / props->num_data_blocks;
    ASSERT_EQ(data_block_size, 4096);
    // No remainder: data_size is an exact multiple of the block count.
    ASSERT_EQ(props->data_size, data_block_size * props->num_data_blocks);
  };

  VerifyBlockAlignment();

  // The below block of code verifies that we can read back the keys. Set
  // block_align to false when creating the reader to ensure we can flip between
  // the two modes without any issues
  std::unique_ptr<TableReader> table_reader;
  bbto.block_align = false;
  Options options2;
  options2.table_factory.reset(NewBlockBasedTableFactory(bbto));
  ImmutableOptions ioptions2(options2);
  const MutableCFOptions moptions2(options2);

  // Open the reader through the block_align=false factory. Going through the
  // writer's factory here would leave block_align enabled and the "flip"
  // described above untested.
  ASSERT_OK(options2.table_factory->NewTableReader(
      TableReaderOptions(ioptions2, moptions2.prefix_extractor.get(),
                         EnvOptions(),
                         GetPlainInternalComparator(options2.comparator)),
      std::move(file_reader), sink->contents().size(), &table_reader));

  ReadOptions read_options;
  std::unique_ptr<InternalIterator> db_iter(table_reader->NewIterator(
      read_options, moptions2.prefix_extractor.get(), /*arena=*/nullptr,
      /*skip_filters=*/false, TableReaderCaller::kUncategorized));

  int expected_key = 1;
  for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
    std::ostringstream ostr;
    ostr << std::setfill('0') << std::setw(5) << expected_key++;
    std::string key = ostr.str();
    std::string value = "val";

    ASSERT_OK(db_iter->status());
    ASSERT_EQ(ExtractUserKey(db_iter->key()).ToString(), key);
    ASSERT_EQ(db_iter->value().ToString(), value);
  }
  // The loop post-incremented once past the last key that was seen.
  expected_key--;
  ASSERT_EQ(expected_key, 10000);
  // Release the iterator before the reader it was created from, as a
  // precaution against the iterator outliving its table reader.
  db_iter.reset();
  table_reader.reset();
}
|
|
|
|
|
|
|
|
TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) {
|
|
|
|
BlockBasedTableOptions bbto = GetBlockBasedTableOptions();
|
|
|
|
bbto.block_align = true;
|
|
|
|
test::StringSink* sink = new test::StringSink();
|
|
|
|
std::unique_ptr<FSWritableFile> holder(sink);
|
|
|
|
std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
|
|
|
|
std::move(holder), "" /* don't care */, FileOptions()));
|
|
|
|
|
|
|
|
Options options;
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
|
|
|
|
const ImmutableOptions ioptions(options);
|
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
InternalKeyComparator ikc(options.comparator);
|
|
|
|
IntTblPropCollectorFactories int_tbl_prop_collector_factories;
|
|
|
|
std::string column_family_name;
|
|
|
|
|
|
|
|
std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
|
|
|
|
TableBuilderOptions(ioptions, moptions, ikc,
|
|
|
|
&int_tbl_prop_collector_factories, kNoCompression,
|
Add more LSM info to FilterBuildingContext (#8246)
Summary:
Add `num_levels`, `is_bottommost`, and table file creation
`reason` to `FilterBuildingContext`, in anticipation of more powerful
Bloom-like filter support.
To support this, added `is_bottommost` and `reason` to
`TableBuilderOptions`, which allowed removing `reason` parameter from
`rocksdb::BuildTable`.
I attempted to remove `skip_filters` from `TableBuilderOptions`, because
filter construction decisions should arise from options, not one-off
parameters. I could not completely remove it because the public API for
SstFileWriter takes a `skip_filters` parameter, and translating this
into an option change would mean awkwardly replacing the table_factory
if it is BlockBasedTableFactory with new filter_policy=nullptr option.
I marked this public skip_filters option as deprecated because of this
oddity. (skip_filters on the read side probably makes sense.)
At least `skip_filters` is now largely hidden for users of
`TableBuilderOptions` and is no longer used for implementing the
optimize_filters_for_hits option. Bringing the logic for that option
closer to handling of FilterBuildingContext makes it more obvious that
these two are using the same notion of "bottommost." (Planned:
configuration options for Bloom-like filters that generalize
`optimize_filters_for_hits`)
Recommended follow-up: Try to get away from "bottommost level" naming of
things, which is inaccurate (see
VersionStorageInfo::RangeMightExistAfterSortedRun), and move to
"bottommost run" or just "bottommost."
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8246
Test Plan:
extended an existing unit test to exercise and check various
filter building contexts. Also, existing tests for
optimize_filters_for_hits validate some of the "bottommost" handling,
which is now closely connected to FilterBuildingContext::is_bottommost
through TableBuilderOptions::is_bottommost
Reviewed By: mrambacher
Differential Revision: D28099346
Pulled By: pdillinger
fbshipit-source-id: 2c1072e29c24d4ac404c761a7b7663292372600a
4 years ago
|
|
|
CompressionOptions(), kUnknownColumnFamily,
|
|
|
|
column_family_name, -1),
|
|
|
|
file_writer.get()));
|
|
|
|
|
|
|
|
for (int i = 1; i <= 10000; ++i) {
|
|
|
|
std::ostringstream ostr;
|
|
|
|
ostr << std::setfill('0') << std::setw(5) << i;
|
|
|
|
std::string key = ostr.str();
|
|
|
|
std::string value = "val";
|
|
|
|
InternalKey ik(key, 0, kTypeValue);
|
|
|
|
|
|
|
|
builder->Add(ik.Encode(), value);
|
|
|
|
}
|
|
|
|
ASSERT_OK(builder->Finish());
|
|
|
|
ASSERT_OK(file_writer->Flush());
|
|
|
|
|
|
|
|
std::unique_ptr<FSRandomAccessFile> source(
|
|
|
|
new test::StringSource(sink->contents(), 73342, true));
|
|
|
|
std::unique_ptr<RandomAccessFileReader> file_reader(
|
|
|
|
new RandomAccessFileReader(std::move(source), "test"));
|
|
|
|
|
|
|
|
{
|
|
|
|
RandomAccessFileReader* file = file_reader.get();
|
|
|
|
uint64_t file_size = sink->contents().size();
|
|
|
|
|
|
|
|
Footer footer;
|
|
|
|
IOOptions opts;
|
|
|
|
ASSERT_OK(ReadFooterFromFile(opts, file, nullptr /* prefetch_buffer */,
|
|
|
|
file_size, &footer,
|
|
|
|
kBlockBasedTableMagicNumber));
|
|
|
|
|
|
|
|
auto BlockFetchHelper = [&](const BlockHandle& handle, BlockType block_type,
|
|
|
|
BlockContents* contents) {
|
|
|
|
ReadOptions read_options;
|
|
|
|
read_options.verify_checksums = false;
|
|
|
|
PersistentCacheOptions cache_options;
|
|
|
|
|
|
|
|
BlockFetcher block_fetcher(
|
|
|
|
file, nullptr /* prefetch_buffer */, footer, read_options, handle,
|
|
|
|
contents, ioptions, false /* decompress */,
|
|
|
|
false /*maybe_compressed*/, block_type,
|
|
|
|
UncompressionDict::GetEmptyDict(), cache_options);
|
|
|
|
|
|
|
|
ASSERT_OK(block_fetcher.ReadBlockContents());
|
|
|
|
};
|
|
|
|
|
|
|
|
// -- Read metaindex block
|
|
|
|
auto metaindex_handle = footer.metaindex_handle();
|
|
|
|
BlockContents metaindex_contents;
|
|
|
|
|
|
|
|
BlockFetchHelper(metaindex_handle, BlockType::kMetaIndex,
|
|
|
|
&metaindex_contents);
|
|
|
|
Block metaindex_block(std::move(metaindex_contents));
|
|
|
|
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
5 years ago
|
|
|
std::unique_ptr<InternalIterator> meta_iter(metaindex_block.NewDataIterator(
|
|
|
|
BytewiseComparator(), kDisableGlobalSequenceNumber));
|
|
|
|
|
|
|
|
// -- Read properties block
|
|
|
|
BlockHandle properties_handle;
|
|
|
|
ASSERT_OK(FindOptionalMetaBlock(meta_iter.get(), kPropertiesBlockName,
|
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilities
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existent
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
3 years ago
|
|
|
&properties_handle));
|
|
|
|
ASSERT_FALSE(properties_handle.IsNull());
|
|
|
|
BlockContents properties_contents;
|
|
|
|
BlockFetchHelper(properties_handle, BlockType::kProperties,
|
|
|
|
&properties_contents);
|
|
|
|
Block properties_block(std::move(properties_contents));
|
|
|
|
|
|
|
|
ASSERT_EQ(properties_block.NumRestarts(), 1u);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) {
  // The properties meta-block should come at the end since we always need to
  // read it when opening a file, unlike index/filter/other meta-blocks, which
  // are sometimes read depending on the user's configuration. This ordering
  // allows us to do a small readahead on the end of the file to read properties
  // and meta-index blocks with one I/O.
  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
  c.Add("a1", "val1");
  c.Add("b2", "val2");
  c.Add("c3", "val3");
  c.Add("d4", "val4");
  c.Add("e5", "val5");
  c.Add("f6", "val6");
  c.Add("g7", "val7");
  c.Add("h8", "val8");
  c.Add("j9", "val9");

  // Write an SST file. A bloom filter policy is configured before the table
  // is built.
  Options options;
  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
  table_options.filter_policy.reset(NewBloomFilterPolicy(
      8 /* bits_per_key */, false /* use_block_based_filter */));
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  ImmutableOptions ioptions(options);
  MutableCFOptions moptions(options);
  std::vector<std::string> keys;
  stl_wrappers::KVMap kvmap;
  c.Finish(options, ioptions, moptions, table_options,
           GetPlainInternalComparator(options.comparator), &keys, &kvmap);

  // Get a file reader over the bytes the constructor just wrote.
  test::StringSink* table_sink = c.TEST_GetSink();
  std::unique_ptr<FSRandomAccessFile> source(new test::StringSource(
      table_sink->contents(), 0 /* unique_id */, false /* allow_mmap_reads */));

  std::unique_ptr<RandomAccessFileReader> table_reader(
      new RandomAccessFileReader(std::move(source), "test"));
  size_t table_size = table_sink->contents().size();

  // Read the footer.
  Footer footer;
  IOOptions opts;
  ASSERT_OK(ReadFooterFromFile(opts, table_reader.get(),
                               nullptr /* prefetch_buffer */, table_size,
                               &footer, kBlockBasedTableMagicNumber));

  // Read the metaindex block that the footer points at.
  auto metaindex_handle = footer.metaindex_handle();
  BlockContents metaindex_contents;
  PersistentCacheOptions pcache_opts;
  BlockFetcher block_fetcher(
      table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(),
      metaindex_handle, &metaindex_contents, ioptions, false /* decompress */,
      false /*maybe_compressed*/, BlockType::kMetaIndex,
      UncompressionDict::GetEmptyDict(), pcache_opts,
      nullptr /*memory_allocator*/);
  ASSERT_OK(block_fetcher.ReadBlockContents());
  Block metaindex_block(std::move(metaindex_contents));

  // Verify the properties block comes last: walk every metaindex entry and
  // remember the key whose block handle has the largest file offset.
  std::unique_ptr<InternalIterator> metaindex_iter{
      metaindex_block.NewDataIterator(options.comparator,
                                      kDisableGlobalSequenceNumber)};
  uint64_t max_offset = 0;
  std::string key_at_max_offset;
  for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid();
       metaindex_iter->Next()) {
    BlockHandle handle;
    Slice value = metaindex_iter->value();
    ASSERT_OK(handle.DecodeFrom(&value));
    if (handle.offset() > max_offset) {
      max_offset = handle.offset();
      key_at_max_offset = metaindex_iter->key().ToString();
    }
  }
  // The loop also ends when the iterator becomes invalid because of an error;
  // make sure the scan did not stop early before trusting its result.
  ASSERT_OK(metaindex_iter->status());
  ASSERT_EQ(kPropertiesBlockName, key_at_max_offset);
  // index handle is stored in footer rather than metaindex block, so need
  // separate logic to verify it comes before properties block.
  ASSERT_GT(max_offset, footer.index_handle().offset());
  c.ResetTableReader();
}
|
|
|
|
|
|
|
|
// Checks that DB::Open() fails for two block_align configurations.
TEST_P(BlockBasedTableTest, BadOptions) {
  ROCKSDB_NAMESPACE::Options options;
  options.compression = kNoCompression;
  BlockBasedTableOptions bbto = GetBlockBasedTableOptions();
  // Case 1: block_align together with a block size of 4000.
  // NOTE(review): presumably rejected because 4000 is not a power of two --
  // confirm against the table factory's option validation.
  bbto.block_size = 4000;
  bbto.block_align = true;

  const std::string kDBPath =
      test::PerThreadDBPath("block_based_table_bad_options_test");
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  ASSERT_OK(DestroyDB(kDBPath, options));
  // Initialized so the out-parameter never holds an indeterminate value.
  ROCKSDB_NAMESPACE::DB* db = nullptr;
  ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));

  // Case 2: block size 4096, but with Snappy compression enabled while
  // block_align is still set.
  // NOTE(review): presumably rejected because block_align is incompatible
  // with compression -- confirm against the option validation.
  bbto.block_size = 4096;
  options.compression = kSnappyCompression;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
}
|
|
|
|
|
|
|
|
// Exercises TailPrefetchStats::GetSuggestedPrefetchSize() as samples are
// recorded through RecordEffectiveSize().
TEST_F(BBTTailPrefetchTest, TestTailPrefetchStats) {
  TailPrefetchStats stats;

  // Records `count` identical samples of `size`.
  auto record_repeated = [&stats](int count, size_t size) {
    for (int n = 0; n < count; ++n) {
      stats.RecordEffectiveSize(size);
    }
  };

  // Nothing recorded yet.
  ASSERT_EQ(0, stats.GetSuggestedPrefetchSize());

  // Three similar samples: the largest one is suggested.
  const size_t kSimilarSamples[] = {1000, 1005, 1002};
  for (const size_t sample : kSimilarSamples) {
    stats.RecordEffectiveSize(sample);
  }
  ASSERT_EQ(1005, stats.GetSuggestedPrefetchSize());

  // One single super large value shouldn't influence much
  const size_t kOutlierThenSmall[] = {1002000, 999};
  for (const size_t sample : kOutlierThenSmall) {
    stats.RecordEffectiveSize(sample);
  }
  ASSERT_LE(1005, stats.GetSuggestedPrefetchSize());
  ASSERT_GT(1200, stats.GetSuggestedPrefetchSize());

  // Only history of 32 is kept
  record_repeated(32, 100);
  ASSERT_EQ(100, stats.GetSuggestedPrefetchSize());

  // A mix of large (16 x 1000) and small (10, 20, 6 x 100) samples: the
  // suggestion is expected to stay close to the small values.
  record_repeated(16, 1000);
  record_repeated(1, 10);
  record_repeated(1, 20);
  record_repeated(6, 100);
  ASSERT_LE(80, stats.GetSuggestedPrefetchSize());
  ASSERT_GT(200, stats.GetSuggestedPrefetchSize());
}
|
|
|
|
|
|
|
|
// Checks that a FilePrefetchBuffer constructed with track_min_offset=true
// reports, via min_offset_read(), the smallest offset that was passed to
// TryReadFromCache().
TEST_F(BBTTailPrefetchTest, FilePrefetchBufferMinOffset) {
  FilePrefetchBuffer buffer(0 /* readahead_size */, 0 /* max_readahead_size */,
                            false /* enable */, true /* track_min_offset */);
  IOOptions opts;
  // Offsets are deliberately not sorted; 480 is the smallest.
  const uint64_t kOffsets[] = {500, 480, 490};
  for (const uint64_t offset : kOffsets) {
    buffer.TryReadFromCache(opts, nullptr /* reader */, offset,
                            10 /* n */, nullptr /* result */,
                            nullptr /* status */);
  }
  ASSERT_EQ(480, buffer.min_offset_read());
}
|
|
|
|
|
|
|
|
// Builds a table with data_block_index_type = kDataBlockBinaryAndHash and
// checks lookups of both present and absent keys through Seek() and Get().
TEST_P(BlockBasedTableTest, DataBlockHashIndex) {
  const int kNumKeys = 500;
  const int kKeySize = 8;
  const int kValSize = 40;

  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
  table_options.data_block_index_type =
      BlockBasedTableOptions::kDataBlockBinaryAndHash;

  Options options;
  options.comparator = BytewiseComparator();
  options.table_factory.reset(new BlockBasedTableFactory(table_options));

  TableConstructor c(options.comparator);

  static Random rnd(1048);
  for (int n = 0; n < kNumKeys; ++n) {
    // Every stored user key ends in "1"; the absent-key lookups further down
    // replace that final character with "0".
    std::string existent_key(rnd.RandomString(kKeySize - 1) + "1");
    InternalKey ikey(existent_key, 0, kTypeValue);
    c.Add(ikey.Encode().ToString(), rnd.RandomString(kValSize));
  }

  std::vector<std::string> keys;
  stl_wrappers::KVMap kvmap;
  const ImmutableOptions ioptions(options);
  const MutableCFOptions moptions(options);
  const InternalKeyComparator internal_comparator(options.comparator);
  c.Finish(options, ioptions, moptions, table_options, internal_comparator,
           &keys, &kvmap);

  auto reader = c.GetTableReader();

  ReadOptions read_options;
  std::unique_ptr<InternalIterator> seek_iter;
  seek_iter.reset(reader->NewIterator(
      read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr,
      /*skip_filters=*/false, TableReaderCaller::kUncategorized));

  // Two passes over all keys: pass 0 looks keys up with Seek(), pass 1 with
  // Get().
  //  - Get() will use the SuffixIndexHash in Block. For a non-existent key it
  //    will invalidate the iterator.
  //  - Seek() will use the default BinarySeek() in Block. For a non-existent
  //    key it will land at the closest key that is larger than the target.
  for (int pass = 0; pass < 2; ++pass) {
    const bool use_seek = (pass == 0);
    ReadOptions ro;

    // Search for existent keys
    for (const auto& entry : kvmap) {
      if (!use_seek) {
        // Search using Get()
        PinnableSlice value;
        std::string user_key = ExtractUserKey(entry.first).ToString();
        GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
                               GetContext::kNotFound, user_key, &value, nullptr,
                               nullptr, true, nullptr, nullptr);
        ASSERT_OK(reader->Get(ro, entry.first, &get_context,
                              moptions.prefix_extractor.get()));
        ASSERT_EQ(get_context.State(), GetContext::kFound);
        ASSERT_EQ(value, Slice(entry.second));
        value.Reset();
      } else {
        // Search using Seek()
        seek_iter->Seek(entry.first);
        ASSERT_OK(seek_iter->status());
        ASSERT_TRUE(seek_iter->Valid());
        ASSERT_EQ(seek_iter->key(), entry.first);
        ASSERT_EQ(seek_iter->value(), entry.second);
      }
    }

    // Search for non-existent keys
    for (const auto& entry : kvmap) {
      std::string user_key = ExtractUserKey(entry.first).ToString();
      user_key.back() = '0';  // make it non-existent key
      InternalKey internal_key(user_key, 0, kTypeValue);
      std::string encoded_key = internal_key.Encode().ToString();
      if (!use_seek) {
        // Search using Get()
        PinnableSlice value;
        GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
                               GetContext::kNotFound, user_key, &value, nullptr,
                               nullptr, true, nullptr, nullptr);
        ASSERT_OK(reader->Get(ro, encoded_key, &get_context,
                              moptions.prefix_extractor.get()));
        ASSERT_EQ(get_context.State(), GetContext::kNotFound);
        value.Reset();
      } else {
        // Search using Seek()
        seek_iter->Seek(encoded_key);
        ASSERT_OK(seek_iter->status());
        // If the iterator landed anywhere, it must be on a key strictly
        // greater than the absent target.
        if (seek_iter->Valid()) {
          ASSERT_TRUE(BytewiseComparator()->Compare(
                          user_key, ExtractUserKey(seek_iter->key())) < 0);
        }
      }
    }
  }
}
|
|
|
|
|
|
|
|
// BlockBasedTableIterator should invalidate itself and return
|
|
|
|
// OutOfBound()=true immediately after Seek(), to allow LevelIterator
|
|
|
|
// filter out corresponding level.
|
|
|
|
TEST_P(BlockBasedTableTest, OutOfBoundOnSeek) {
|
|
|
|
TableConstructor c(BytewiseComparator(), true /*convert_to_internal_key*/);
|
|
|
|
c.Add("foo", "v1");
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
Options options;
|
|
|
|
BlockBasedTableOptions table_opt(GetBlockBasedTableOptions());
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_opt));
|
|
|
|
const ImmutableOptions ioptions(options);
|
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_opt,
|
|
|
|
GetPlainInternalComparator(BytewiseComparator()), &keys, &kvmap);
|
|
|
|
auto* reader = c.GetTableReader();
|
|
|
|
ReadOptions read_opt;
|
|
|
|
std::string upper_bound = "bar";
|
|
|
|
Slice upper_bound_slice(upper_bound);
|
|
|
|
read_opt.iterate_upper_bound = &upper_bound_slice;
|
|
|
|
std::unique_ptr<InternalIterator> iter;
|
|
|
|
iter.reset(new KeyConvertingIterator(reader->NewIterator(
|
|
|
|
read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
|
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized)));
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound);
|
|
|
|
iter.reset(new KeyConvertingIterator(reader->NewIterator(
|
|
|
|
read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
|
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized)));
|
|
|
|
iter->Seek("foo");
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound);
|
|
|
|
}
|
|
|
|
|
|
|
|
// BlockBasedTableIterator should invalidate itself and return
|
|
|
|
// OutOfBound()=true after Next(), if it finds current index key is no smaller
|
|
|
|
// than upper bound, unless it is pointing to the last data block.
|
|
|
|
TEST_P(BlockBasedTableTest, OutOfBoundOnNext) {
|
|
|
|
TableConstructor c(BytewiseComparator(), true /*convert_to_internal_key*/);
|
|
|
|
c.Add("bar", "v");
|
|
|
|
c.Add("foo", "v");
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
Options options;
|
|
|
|
BlockBasedTableOptions table_opt(GetBlockBasedTableOptions());
|
|
|
|
table_opt.flush_block_policy_factory =
|
|
|
|
std::make_shared<FlushBlockEveryKeyPolicyFactory>();
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_opt));
|
|
|
|
const ImmutableOptions ioptions(options);
|
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_opt,
|
|
|
|
GetPlainInternalComparator(BytewiseComparator()), &keys, &kvmap);
|
|
|
|
auto* reader = c.GetTableReader();
|
|
|
|
ReadOptions read_opt;
|
|
|
|
std::string ub1 = "bar_after";
|
|
|
|
Slice ub_slice1(ub1);
|
|
|
|
read_opt.iterate_upper_bound = &ub_slice1;
|
|
|
|
std::unique_ptr<InternalIterator> iter;
|
|
|
|
iter.reset(new KeyConvertingIterator(reader->NewIterator(
|
|
|
|
read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
|
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized)));
|
|
|
|
iter->Seek("bar");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("bar", iter->key());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
ASSERT_TRUE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound);
|
|
|
|
std::string ub2 = "foo_after";
|
|
|
|
Slice ub_slice2(ub2);
|
|
|
|
read_opt.iterate_upper_bound = &ub_slice2;
|
|
|
|
iter.reset(new KeyConvertingIterator(reader->NewIterator(
|
|
|
|
read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
|
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized)));
|
|
|
|
iter->Seek("foo");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("foo", iter->key());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
ASSERT_FALSE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Tracks the block cache's pinned usage while a table builder buffers data
// blocks for compression-dictionary building: the reservation should grow
// when a block is buffered and drop back to zero once the builder finishes.
TEST_P(
    BlockBasedTableTest,
    IncreaseCacheReservationForCompressDictBuildingBufferOnBuilderAddAndDecreaseOnBuilderFinish) {
  constexpr std::size_t kSizeDummyEntry = 256 * 1024;
  constexpr std::size_t kMetaDataChargeOverhead = 10000;
  constexpr std::size_t kCacheCapacity = 8 * 1024 * 1024;
  constexpr std::size_t kMaxDictBytes = 1024;
  constexpr std::size_t kMaxDictBufferBytes = 1024;

  // Single-shard LRU cache with a strict capacity limit.
  LRUCacheOptions lo;
  lo.capacity = kCacheCapacity;
  lo.num_shard_bits = 0;  // 2^0 shard
  lo.strict_capacity_limit = true;
  std::shared_ptr<Cache> cache(NewLRUCache(lo));

  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
  table_options.block_cache = cache;
  table_options.flush_block_policy_factory =
      std::make_shared<FlushBlockEveryKeyPolicyFactory>();

  Options options;
  options.compression = kSnappyCompression;
  options.compression_opts.max_dict_bytes = kMaxDictBytes;
  options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));

  std::unique_ptr<FSWritableFile> holder(new test::StringSink());
  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
      std::move(holder), "test_file_name", FileOptions()));

  ImmutableOptions ioptions(options);
  MutableCFOptions moptions(options);
  InternalKeyComparator ikc(options.comparator);
  IntTblPropCollectorFactories int_tbl_prop_collector_factories;

  std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
      TableBuilderOptions(ioptions, moptions, ikc,
                          &int_tbl_prop_collector_factories, kSnappyCompression,
                          options.compression_opts, kUnknownColumnFamily,
                          "test_cf", -1 /* level */),
      file_writer.get()));

  // Adds one kTypeValue entry with the given user key, sequence number and
  // value to the builder.
  auto add_entry = [&builder](const std::string& user_key, uint64_t seq,
                              const std::string& value) {
    InternalKey ikey(user_key, seq, kTypeValue);
    builder->Add(ikey.Encode(), value);
  };

  // Adding the first key won't trigger a flush by FlushBlockEveryKeyPolicy
  // therefore won't trigger any data block's buffering
  add_entry("key1", 0 /* sequence number */, "val1");
  ASSERT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);

  // Adding the second key will trigger a flush of the last data block (the one
  // containing key1 and value1) by FlushBlockEveryKeyPolicy and hence trigger
  // buffering of that data block.
  add_entry("key2", 1 /* sequence number */, "val2");
  // Cache reservation will increase for last buffered data block (the one
  // containing key1 and value1) since the buffer limit is not exceeded after
  // that buffering and the cache will not be full after this reservation
  EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry);
  EXPECT_LT(cache->GetPinnedUsage(),
            1 * kSizeDummyEntry + kMetaDataChargeOverhead);

  // Finishing the builder is expected to release the whole reservation.
  ASSERT_OK(builder->Finish());
  EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
}
|
|
|
|
|
|
|
|
// Tracks the block cache's pinned usage while a table builder buffers data
// blocks for compression-dictionary building: the reservation should grow
// while blocks are buffered and drop to zero once the dictionary buffer limit
// (max_dict_buffer_bytes) is exceeded.
TEST_P(
    BlockBasedTableTest,
    IncreaseCacheReservationForCompressDictBuildingBufferOnBuilderAddAndDecreaseOnBufferLimitExceed) {
  constexpr std::size_t kSizeDummyEntry = 256 * 1024;
  constexpr std::size_t kMetaDataChargeOverhead = 10000;
  constexpr std::size_t kCacheCapacity = 8 * 1024 * 1024;
  constexpr std::size_t kMaxDictBytes = 1024;
  constexpr std::size_t kMaxDictBufferBytes = 2 * kSizeDummyEntry;

  // Single-shard LRU cache with a strict capacity limit.
  LRUCacheOptions lo;
  lo.capacity = kCacheCapacity;
  lo.num_shard_bits = 0;  // 2^0 shard
  lo.strict_capacity_limit = true;
  std::shared_ptr<Cache> cache(NewLRUCache(lo));

  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
  table_options.block_cache = cache;
  table_options.flush_block_policy_factory =
      std::make_shared<FlushBlockEveryKeyPolicyFactory>();

  Options options;
  options.compression = kSnappyCompression;
  options.compression_opts.max_dict_bytes = kMaxDictBytes;
  options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));

  std::unique_ptr<FSWritableFile> holder(new test::StringSink());
  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
      std::move(holder), "test_file_name", FileOptions()));

  ImmutableOptions ioptions(options);
  MutableCFOptions moptions(options);
  InternalKeyComparator ikc(options.comparator);
  IntTblPropCollectorFactories int_tbl_prop_collector_factories;

  std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
      TableBuilderOptions(ioptions, moptions, ikc,
                          &int_tbl_prop_collector_factories, kSnappyCompression,
                          options.compression_opts, kUnknownColumnFamily,
                          "test_cf", -1 /* level */),
      file_writer.get()));

  // Adds one kTypeValue entry with the given user key, sequence number and
  // value to the builder.
  auto add_entry = [&builder](const std::string& user_key, uint64_t seq,
                              const std::string& value) {
    InternalKey ikey(user_key, seq, kTypeValue);
    builder->Add(ikey.Encode(), value);
  };

  // The first two values are each as large as one dummy entry.
  const std::string big_value(kSizeDummyEntry, '0');

  // Adding the first key won't trigger a flush by FlushBlockEveryKeyPolicy
  // therefore won't trigger any data block's buffering
  add_entry("key1", 0 /* sequence number */, big_value);
  ASSERT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);

  // Adding the second key will trigger a flush of the last data block (the one
  // containing key1 and value1) by FlushBlockEveryKeyPolicy and hence trigger
  // buffering of the last data block.
  add_entry("key2", 1 /* sequence number */, big_value);
  // Cache reservation will increase for last buffered data block (the one
  // containing key1 and value1) since the buffer limit is not exceeded after
  // the buffering and the cache will not be full after this reservation
  EXPECT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry);
  EXPECT_LT(cache->GetPinnedUsage(),
            2 * kSizeDummyEntry + kMetaDataChargeOverhead);

  // Adding the third key will trigger a flush of the last data block (the one
  // containing key2 and value2) by FlushBlockEveryKeyPolicy and hence trigger
  // buffering of the last data block.
  add_entry("key3", 2 /* sequence number */, "val3");
  // Cache reservation will decrease since the buffer limit is now exceeded
  // after the last buffering and EnterUnbuffered() is triggered
  EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);

  ASSERT_OK(builder->Finish());
  EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
}
|
|
|
|
|
|
|
|
// Exercises the block cache reservation made on behalf of the buffer that
// holds data blocks while a compression dictionary is being built.
//
// Scenario: the dictionary buffer limit is huge (so it is never the reason
// for leaving buffered mode) while the strict-capacity cache is tiny (1.5
// dummy entries). Buffering the first data block fits in the cache; buffering
// the second, large one would overflow it, at which point the reservation is
// expected to drop back to zero.
TEST_P(
    BlockBasedTableTest,
    IncreaseCacheReservationForCompressDictBuildingBufferOnBuilderAddAndDecreaseOnCacheFull) {
  constexpr std::size_t kSizeDummyEntry = 256 * 1024;
  // Slack allowed on top of the expected reservation when checking the upper
  // bound of the pinned usage.
  constexpr std::size_t kMetaDataChargeOverhead = 10000;
  // Deliberately small: reserving cache space for two buffered data blocks
  // (key1/value1 and key2/a big value2) must make the cache full.
  constexpr std::size_t kCacheCapacity =
      1 * kSizeDummyEntry + kSizeDummyEntry / 2;
  constexpr std::size_t kMaxDictBytes = 1024;
  // Deliberately large: adding the big (key2, value2) pair must NOT exceed
  // the dictionary buffer limit.
  constexpr std::size_t kMaxDictBufferBytes = 1024 * 1024 * 1024;

  // Single-shard LRU cache with a strict capacity limit, shared with the
  // table factory as its block cache.
  BlockBasedTableOptions bbt_opts = GetBlockBasedTableOptions();
  LRUCacheOptions lru_opts;
  lru_opts.capacity = kCacheCapacity;
  lru_opts.num_shard_bits = 0;  // 2^0 shard
  lru_opts.strict_capacity_limit = true;
  std::shared_ptr<Cache> cache(NewLRUCache(lru_opts));
  bbt_opts.block_cache = cache;
  // Every key starts a new data block.
  bbt_opts.flush_block_policy_factory =
      std::make_shared<FlushBlockEveryKeyPolicyFactory>();

  // Snappy compression with dictionary building enabled.
  Options opts;
  opts.compression = kSnappyCompression;
  opts.compression_opts.max_dict_bytes = kMaxDictBytes;
  opts.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes;
  opts.table_factory.reset(NewBlockBasedTableFactory(bbt_opts));

  // In-memory destination file for the table builder.
  std::unique_ptr<FSWritableFile> sink_file(new test::StringSink());
  std::unique_ptr<WritableFileWriter> writer;
  writer.reset(new WritableFileWriter(std::move(sink_file), "test_file_name",
                                      FileOptions()));

  ImmutableOptions immutable_opts(opts);
  MutableCFOptions mutable_cf_opts(opts);
  InternalKeyComparator internal_cmp(opts.comparator);
  IntTblPropCollectorFactories prop_collector_factories;

  TableBuilderOptions tb_opts(immutable_opts, mutable_cf_opts, internal_cmp,
                              &prop_collector_factories, kSnappyCompression,
                              opts.compression_opts, kUnknownColumnFamily,
                              "test_cf", -1 /* level */);
  std::unique_ptr<TableBuilder> builder(
      opts.table_factory->NewTableBuilder(tb_opts, writer.get()));

  // Step 1: the very first key cannot trigger a flush under
  // FlushBlockEveryKeyPolicy, so no data block is buffered and nothing is
  // reserved in the cache yet.
  std::string user_key1 = "key1";
  std::string small_value1 = "val1";
  InternalKey ikey1(user_key1, 0 /* sequence number */, kTypeValue);
  builder->Add(ikey1.Encode(), small_value1);
  ASSERT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);

  // Step 2: the second key flushes the previous data block (key1/value1),
  // which therefore gets buffered. The buffer limit is not exceeded and the
  // cache is not full after the reservation, so the reservation grows to one
  // dummy entry.
  std::string user_key2 = "key2";
  std::string big_value2(kSizeDummyEntry, '0');
  InternalKey ikey2(user_key2, 1 /* sequence number */, kTypeValue);
  builder->Add(ikey2.Encode(), big_value2);
  EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry);
  EXPECT_LT(cache->GetPinnedUsage(),
            1 * kSizeDummyEntry + kMetaDataChargeOverhead);

  // Step 3: the third key flushes the previous data block (key2/value2),
  // which gets buffered as well. Increasing the reservation for that block
  // makes the cache full, EnterUnbuffered() is triggered, and the reservation
  // goes back down to zero.
  std::string user_key3 = "key3";
  std::string small_value3 = "value3";
  InternalKey ikey3(user_key3, 2 /* sequence number */, kTypeValue);
  builder->Add(ikey3.Encode(), small_value3);
  EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);

  // Finishing the table must leave nothing reserved.
  ASSERT_OK(builder->Finish());
  EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
}
|
|
|
|
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
|
|
|
|
|
|
int main(int argc, char** argv) {
|
|
|
|
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
|
|
|
|
::testing::InitGoogleTest(&argc, argv);
|
|
|
|
return RUN_ALL_TESTS();
|
|
|
|
}
|