// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "table/block_based/index_builder.h"

#include <assert.h>

#include <cinttypes>
#include <list>
#include <string>

#include "db/dbformat.h"
#include "rocksdb/comparator.h"
#include "rocksdb/flush_block_policy.h"
#include "table/block_based/partitioned_filter_block.h"
#include "table/format.h"

namespace ROCKSDB_NAMESPACE {

// Create an index builder based on its type.
IndexBuilder* IndexBuilder::CreateIndexBuilder(
    BlockBasedTableOptions::IndexType index_type,
    const InternalKeyComparator* comparator,
    const InternalKeySliceTransform* int_key_slice_transform,
    const bool use_value_delta_encoding,
    const BlockBasedTableOptions& table_opt, size_t ts_sz,
    const bool persist_user_defined_timestamps) {
  IndexBuilder* result = nullptr;
  switch (index_type) {
    case BlockBasedTableOptions::kBinarySearch: {
      result = new ShortenedIndexBuilder(
          comparator, table_opt.index_block_restart_interval,
          table_opt.format_version, use_value_delta_encoding,
          table_opt.index_shortening, /* include_first_key */ false, ts_sz,
          persist_user_defined_timestamps);
      break;
    }
    case BlockBasedTableOptions::kHashSearch: {
      // Currently kHashSearch is incompatible with
      // index_block_restart_interval > 1
      assert(table_opt.index_block_restart_interval == 1);
      result = new HashIndexBuilder(
          comparator, int_key_slice_transform,
          table_opt.index_block_restart_interval, table_opt.format_version,
          use_value_delta_encoding, table_opt.index_shortening, ts_sz,
          persist_user_defined_timestamps);
      break;
    }
    case BlockBasedTableOptions::kTwoLevelIndexSearch: {
      result = PartitionedIndexBuilder::CreateIndexBuilder(
          comparator, use_value_delta_encoding, table_opt, ts_sz,
          persist_user_defined_timestamps);
      break;
    }
    case BlockBasedTableOptions::kBinarySearchWithFirstKey: {
      result = new ShortenedIndexBuilder(
          comparator, table_opt.index_block_restart_interval,
          table_opt.format_version, use_value_delta_encoding,
          table_opt.index_shortening, /* include_first_key */ true, ts_sz,
          persist_user_defined_timestamps);
      break;
    }
    default: {
      assert(!"Do not recognize the index type ");
      break;
    }
  }
  return result;
}

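// Shorten the separator stored in the index between two data blocks. The user
// portion of *start is shortened with Comparator::FindShortestSeparator(); if
// that produced a physically shorter but logically larger user key, the
// earliest possible sequence number (kMaxSequenceNumber, kValueTypeForSeek) is
// appended so that the new separator still sorts after the original *start and
// before limit when compared as an internal key.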
void ShortenedIndexBuilder::FindShortestInternalKeySeparator(
    const Comparator& comparator, std::string* start, const Slice& limit) {
  // Attempt to shorten the user portion of the key.
  Slice user_start = ExtractUserKey(*start);
  Slice user_limit = ExtractUserKey(limit);
  std::string tmp(user_start.data(), user_start.size());
  comparator.FindShortestSeparator(&tmp, user_limit);
  if (tmp.size() <= user_start.size() &&
      comparator.Compare(user_start, tmp) < 0) {
    // User key has become shorter physically, but larger logically.
    // Tack on the earliest possible number to the shortened user key.
    PutFixed64(&tmp,
               PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek));
    assert(InternalKeyComparator(&comparator).Compare(*start, tmp) < 0);
    assert(InternalKeyComparator(&comparator).Compare(tmp, limit) < 0);
    start->swap(tmp);
  }
}

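// Shorten the last key of the last data block for use as the final index
// entry. The user portion of *key is advanced with
// Comparator::FindShortSuccessor(); if that produced a physically shorter but
// logically larger user key, the earliest possible sequence number is appended
// so that the result still sorts after the original *key as an internal key.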
void ShortenedIndexBuilder::FindShortInternalKeySuccessor(
    const Comparator& comparator, std::string* key) {
  Slice user_key = ExtractUserKey(*key);
  std::string tmp(user_key.data(), user_key.size());
  comparator.FindShortSuccessor(&tmp);
  if (tmp.size() <= user_key.size() && comparator.Compare(user_key, tmp) < 0) {
    // User key has become shorter physically, but larger logically.
    // Tack on the earliest possible number to the shortened user key.
    PutFixed64(&tmp,
               PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek));
    assert(InternalKeyComparator(&comparator).Compare(*key, tmp) < 0);
    key->swap(tmp);
  }
}

PartitionedIndexBuilder* PartitionedIndexBuilder::CreateIndexBuilder(
    const InternalKeyComparator* comparator,
    const bool use_value_delta_encoding,
    const BlockBasedTableOptions& table_opt, size_t ts_sz,
    const bool persist_user_defined_timestamps) {
  return new PartitionedIndexBuilder(comparator, table_opt,
                                     use_value_delta_encoding, ts_sz,
                                     persist_user_defined_timestamps);
}

PartitionedIndexBuilder::PartitionedIndexBuilder(
    const InternalKeyComparator* comparator,
    const BlockBasedTableOptions& table_opt,
    const bool use_value_delta_encoding, size_t ts_sz,
    const bool persist_user_defined_timestamps)
    : IndexBuilder(comparator, ts_sz, persist_user_defined_timestamps),
      index_block_builder_(
          table_opt.index_block_restart_interval, true /*use_delta_encoding*/,
          use_value_delta_encoding,
          BlockBasedTableOptions::kDataBlockBinarySearch /* index_type */,
          0.75 /* data_block_hash_table_util_ratio */, ts_sz,
          persist_user_defined_timestamps, false /* is_user_key */),
      index_block_builder_without_seq_(
          table_opt.index_block_restart_interval, true /*use_delta_encoding*/,
          use_value_delta_encoding,
          BlockBasedTableOptions::kDataBlockBinarySearch /* index_type */,
          0.75 /* data_block_hash_table_util_ratio */, ts_sz,
          persist_user_defined_timestamps, true /* is_user_key */),
      sub_index_builder_(nullptr),
      table_opt_(table_opt),
      // We start with false. After each partition we revise the value based on
      // what the sub_index_builder has decided. If the feature is disabled
      // entirely, this will be set to true after switching the first
      // sub_index_builder. Otherwise, it is set to true as soon as one of the
      // sub_index_builders could not safely exclude seq from the keys, and it
      // is then enforced on all sub_index_builders in ::Finish.
      seperator_is_key_plus_seq_(false),
      use_value_delta_encoding_(use_value_delta_encoding) {}

PartitionedIndexBuilder::~PartitionedIndexBuilder() {
  delete sub_index_builder_;
}

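// Start a new index partition: allocate a fresh ShortenedIndexBuilder for the
// upcoming entries and reset flush_policy_, which decides (based on
// metadata_block_size and block_size_deviation) when this partition is cut.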
void PartitionedIndexBuilder::MakeNewSubIndexBuilder() {
  assert(sub_index_builder_ == nullptr);
  sub_index_builder_ = new ShortenedIndexBuilder(
      comparator_, table_opt_.index_block_restart_interval,
      table_opt_.format_version, use_value_delta_encoding_,
      table_opt_.index_shortening, /* include_first_key */ false, ts_sz_,
      persist_user_defined_timestamps_);

  // Set sub_index_builder_->seperator_is_key_plus_seq_ to true if
  // seperator_is_key_plus_seq_ is true (internal-key mode; it is false by
  // default on creation) so that the flush policy can point to
  // sub_index_builder_->index_block_builder_.
  if (seperator_is_key_plus_seq_) {
    sub_index_builder_->seperator_is_key_plus_seq_ = true;
  }

  flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
      table_opt_.metadata_block_size, table_opt_.block_size_deviation,
      // Note: this is sub-optimal since sub_index_builder_ could later reset
      // seperator_is_key_plus_seq_ but the probability of that is low.
      sub_index_builder_->seperator_is_key_plus_seq_
          ? sub_index_builder_->index_block_builder_
          : sub_index_builder_->index_block_builder_without_seq_));
  partition_cut_requested_ = false;
}

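// Request that the current index partition be cut; the cut itself happens on
// the next AddIndexEntry() call.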
void PartitionedIndexBuilder::RequestPartitionCut() {
  partition_cut_requested_ = true;
}

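// Add an index entry pointing at the data block described by block_handle,
// keyed by a shortened separator between last_key_in_current_block and
// first_key_in_next_block. When first_key_in_next_block is nullptr the table
// is done, so the entry is added and the current partition is closed
// unconditionally. Otherwise the flush policy (or a pending
// RequestPartitionCut()) decides whether to close the current partition and
// start a new sub-index builder before the entry is added.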
void PartitionedIndexBuilder::AddIndexEntry(
    std::string* last_key_in_current_block,
    const Slice* first_key_in_next_block, const BlockHandle& block_handle) {
  // Note: to avoid two consecutive flushes in the same method call, we do not
  // check the flush policy when adding the last key.
  if (UNLIKELY(first_key_in_next_block == nullptr)) {  // no more keys
    if (sub_index_builder_ == nullptr) {
      MakeNewSubIndexBuilder();
    }
    sub_index_builder_->AddIndexEntry(last_key_in_current_block,
                                      first_key_in_next_block, block_handle);
    if (!seperator_is_key_plus_seq_ &&
        sub_index_builder_->seperator_is_key_plus_seq_) {
      // Then we need to apply it to all sub-index builders and reset
      // flush_policy to point to the block builder of sub_index_builder_ that
      // stores internal keys.
      seperator_is_key_plus_seq_ = true;
      flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
          table_opt_.metadata_block_size, table_opt_.block_size_deviation,
          sub_index_builder_->index_block_builder_));
    }
    sub_index_last_key_ = std::string(*last_key_in_current_block);
    entries_.push_back(
        {sub_index_last_key_,
         std::unique_ptr<ShortenedIndexBuilder>(sub_index_builder_)});
    sub_index_builder_ = nullptr;
    cut_filter_block = true;
  } else {
    // Apply the flush policy only to a non-empty sub_index_builder_.
    if (sub_index_builder_ != nullptr) {
      std::string handle_encoding;
      block_handle.EncodeTo(&handle_encoding);
      bool do_flush =
          partition_cut_requested_ ||
          flush_policy_->Update(*last_key_in_current_block, handle_encoding);
      if (do_flush) {
        entries_.push_back(
            {sub_index_last_key_,
             std::unique_ptr<ShortenedIndexBuilder>(sub_index_builder_)});
        cut_filter_block = true;
        sub_index_builder_ = nullptr;
      }
    }
    if (sub_index_builder_ == nullptr) {
      MakeNewSubIndexBuilder();
    }
    sub_index_builder_->AddIndexEntry(last_key_in_current_block,
                                      first_key_in_next_block, block_handle);
    sub_index_last_key_ = std::string(*last_key_in_current_block);
    if (!seperator_is_key_plus_seq_ &&
        sub_index_builder_->seperator_is_key_plus_seq_) {
      // Then we need to apply it to all sub-index builders and reset
      // flush_policy to point to the block builder of sub_index_builder_ that
      // stores internal keys.
      seperator_is_key_plus_seq_ = true;
      flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
          table_opt_.metadata_block_size, table_opt_.block_size_deviation,
          sub_index_builder_->index_block_builder_));
    }
  }
}

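// Finish is called repeatedly until it returns Status::OK(). Each call after
// the first records the handle of the partition written for the previous call
// in the top-level index; then it either finishes the next partition and
// returns Status::Incomplete(), or, once all partitions have been written,
// finishes the top-level (2nd level) index block and returns Status::OK().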
Status PartitionedIndexBuilder::Finish(
    IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) {
  if (partition_cnt_ == 0) {
    partition_cnt_ = entries_.size();
  }
  // It must be set to null after the last key is added.
  assert(sub_index_builder_ == nullptr);
  if (finishing_indexes == true) {
    Entry& last_entry = entries_.front();
    std::string handle_encoding;
    last_partition_block_handle.EncodeTo(&handle_encoding);
    std::string handle_delta_encoding;
    PutVarsignedint64(
        &handle_delta_encoding,
        last_partition_block_handle.size() - last_encoded_handle_.size());
    last_encoded_handle_ = last_partition_block_handle;
    const Slice handle_delta_encoding_slice(handle_delta_encoding);
    index_block_builder_.Add(last_entry.key, handle_encoding,
                             &handle_delta_encoding_slice);
    if (!seperator_is_key_plus_seq_) {
      index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key),
                                           handle_encoding,
                                           &handle_delta_encoding_slice);
    }
    entries_.pop_front();
  }
  // If there is no sub_index left, then return the 2nd level index.
  if (UNLIKELY(entries_.empty())) {
    if (seperator_is_key_plus_seq_) {
      index_blocks->index_block_contents = index_block_builder_.Finish();
    } else {
      index_blocks->index_block_contents =
          index_block_builder_without_seq_.Finish();
    }
    top_level_index_size_ = index_blocks->index_block_contents.size();
    index_size_ += top_level_index_size_;
    return Status::OK();
  } else {
    // Finish the next partition index in line and return Incomplete() to
    // indicate that we expect more calls to Finish.
    Entry& entry = entries_.front();
    // Apply the policy to all sub-indexes.
    entry.value->seperator_is_key_plus_seq_ = seperator_is_key_plus_seq_;
    auto s = entry.value->Finish(index_blocks);
    index_size_ += index_blocks->index_block_contents.size();
    finishing_indexes = true;
    return s.ok() ? Status::Incomplete() : s;
  }
}

size_t PartitionedIndexBuilder::NumPartitions() const { return partition_cnt_; }
}  // namespace ROCKSDB_NAMESPACE