fork of https://github.com/oxigraph/rocksdb and https://github.com/facebook/rocksdb for nextgraph and oxigraph
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
297 lines
13 KiB
297 lines
13 KiB
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#include "table/block_based/index_builder.h"
|
|
|
|
#include <assert.h>
|
|
|
|
#include <cinttypes>
|
|
#include <list>
|
|
#include <string>
|
|
|
|
#include "db/dbformat.h"
|
|
#include "rocksdb/comparator.h"
|
|
#include "rocksdb/flush_block_policy.h"
|
|
#include "table/block_based/partitioned_filter_block.h"
|
|
#include "table/format.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
// Create a index builder based on its type.
|
|
IndexBuilder* IndexBuilder::CreateIndexBuilder(
|
|
BlockBasedTableOptions::IndexType index_type,
|
|
const InternalKeyComparator* comparator,
|
|
const InternalKeySliceTransform* int_key_slice_transform,
|
|
const bool use_value_delta_encoding,
|
|
const BlockBasedTableOptions& table_opt, size_t ts_sz,
|
|
const bool persist_user_defined_timestamps) {
|
|
IndexBuilder* result = nullptr;
|
|
switch (index_type) {
|
|
case BlockBasedTableOptions::kBinarySearch: {
|
|
result = new ShortenedIndexBuilder(
|
|
comparator, table_opt.index_block_restart_interval,
|
|
table_opt.format_version, use_value_delta_encoding,
|
|
table_opt.index_shortening, /* include_first_key */ false, ts_sz,
|
|
persist_user_defined_timestamps);
|
|
break;
|
|
}
|
|
case BlockBasedTableOptions::kHashSearch: {
|
|
// Currently kHashSearch is incompatible with index_block_restart_interval
|
|
// > 1
|
|
assert(table_opt.index_block_restart_interval == 1);
|
|
result = new HashIndexBuilder(
|
|
comparator, int_key_slice_transform,
|
|
table_opt.index_block_restart_interval, table_opt.format_version,
|
|
use_value_delta_encoding, table_opt.index_shortening, ts_sz,
|
|
persist_user_defined_timestamps);
|
|
break;
|
|
}
|
|
case BlockBasedTableOptions::kTwoLevelIndexSearch: {
|
|
result = PartitionedIndexBuilder::CreateIndexBuilder(
|
|
comparator, use_value_delta_encoding, table_opt, ts_sz,
|
|
persist_user_defined_timestamps);
|
|
break;
|
|
}
|
|
case BlockBasedTableOptions::kBinarySearchWithFirstKey: {
|
|
result = new ShortenedIndexBuilder(
|
|
comparator, table_opt.index_block_restart_interval,
|
|
table_opt.format_version, use_value_delta_encoding,
|
|
table_opt.index_shortening, /* include_first_key */ true, ts_sz,
|
|
persist_user_defined_timestamps);
|
|
break;
|
|
}
|
|
default: {
|
|
assert(!"Do not recognize the index type ");
|
|
break;
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
void ShortenedIndexBuilder::FindShortestInternalKeySeparator(
|
|
const Comparator& comparator, std::string* start, const Slice& limit) {
|
|
// Attempt to shorten the user portion of the key
|
|
Slice user_start = ExtractUserKey(*start);
|
|
Slice user_limit = ExtractUserKey(limit);
|
|
std::string tmp(user_start.data(), user_start.size());
|
|
comparator.FindShortestSeparator(&tmp, user_limit);
|
|
if (tmp.size() <= user_start.size() &&
|
|
comparator.Compare(user_start, tmp) < 0) {
|
|
// User key has become shorter physically, but larger logically.
|
|
// Tack on the earliest possible number to the shortened user key.
|
|
PutFixed64(&tmp,
|
|
PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek));
|
|
assert(InternalKeyComparator(&comparator).Compare(*start, tmp) < 0);
|
|
assert(InternalKeyComparator(&comparator).Compare(tmp, limit) < 0);
|
|
start->swap(tmp);
|
|
}
|
|
}
|
|
|
|
void ShortenedIndexBuilder::FindShortInternalKeySuccessor(
|
|
const Comparator& comparator, std::string* key) {
|
|
Slice user_key = ExtractUserKey(*key);
|
|
std::string tmp(user_key.data(), user_key.size());
|
|
comparator.FindShortSuccessor(&tmp);
|
|
if (tmp.size() <= user_key.size() && comparator.Compare(user_key, tmp) < 0) {
|
|
// User key has become shorter physically, but larger logically.
|
|
// Tack on the earliest possible number to the shortened user key.
|
|
PutFixed64(&tmp,
|
|
PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek));
|
|
assert(InternalKeyComparator(&comparator).Compare(*key, tmp) < 0);
|
|
key->swap(tmp);
|
|
}
|
|
}
|
|
|
|
PartitionedIndexBuilder* PartitionedIndexBuilder::CreateIndexBuilder(
|
|
const InternalKeyComparator* comparator,
|
|
const bool use_value_delta_encoding,
|
|
const BlockBasedTableOptions& table_opt, size_t ts_sz,
|
|
const bool persist_user_defined_timestamps) {
|
|
return new PartitionedIndexBuilder(comparator, table_opt,
|
|
use_value_delta_encoding, ts_sz,
|
|
persist_user_defined_timestamps);
|
|
}
|
|
|
|
PartitionedIndexBuilder::PartitionedIndexBuilder(
|
|
const InternalKeyComparator* comparator,
|
|
const BlockBasedTableOptions& table_opt,
|
|
const bool use_value_delta_encoding, size_t ts_sz,
|
|
const bool persist_user_defined_timestamps)
|
|
: IndexBuilder(comparator, ts_sz, persist_user_defined_timestamps),
|
|
index_block_builder_(
|
|
table_opt.index_block_restart_interval, true /*use_delta_encoding*/,
|
|
use_value_delta_encoding,
|
|
BlockBasedTableOptions::kDataBlockBinarySearch /* index_type */,
|
|
0.75 /* data_block_hash_table_util_ratio */, ts_sz,
|
|
persist_user_defined_timestamps, false /* is_user_key */),
|
|
index_block_builder_without_seq_(
|
|
table_opt.index_block_restart_interval, true /*use_delta_encoding*/,
|
|
use_value_delta_encoding,
|
|
BlockBasedTableOptions::kDataBlockBinarySearch /* index_type */,
|
|
0.75 /* data_block_hash_table_util_ratio */, ts_sz,
|
|
persist_user_defined_timestamps, true /* is_user_key */),
|
|
sub_index_builder_(nullptr),
|
|
table_opt_(table_opt),
|
|
// We start by false. After each partition we revise the value based on
|
|
// what the sub_index_builder has decided. If the feature is disabled
|
|
// entirely, this will be set to true after switching the first
|
|
// sub_index_builder. Otherwise, it could be set to true even one of the
|
|
// sub_index_builders could not safely exclude seq from the keys, then it
|
|
// wil be enforced on all sub_index_builders on ::Finish.
|
|
seperator_is_key_plus_seq_(false),
|
|
use_value_delta_encoding_(use_value_delta_encoding) {}
|
|
|
|
PartitionedIndexBuilder::~PartitionedIndexBuilder() {
|
|
delete sub_index_builder_;
|
|
}
|
|
|
|
void PartitionedIndexBuilder::MakeNewSubIndexBuilder() {
|
|
assert(sub_index_builder_ == nullptr);
|
|
sub_index_builder_ = new ShortenedIndexBuilder(
|
|
comparator_, table_opt_.index_block_restart_interval,
|
|
table_opt_.format_version, use_value_delta_encoding_,
|
|
table_opt_.index_shortening, /* include_first_key */ false, ts_sz_,
|
|
persist_user_defined_timestamps_);
|
|
|
|
// Set sub_index_builder_->seperator_is_key_plus_seq_ to true if
|
|
// seperator_is_key_plus_seq_ is true (internal-key mode) (set to false by
|
|
// default on Creation) so that flush policy can point to
|
|
// sub_index_builder_->index_block_builder_
|
|
if (seperator_is_key_plus_seq_) {
|
|
sub_index_builder_->seperator_is_key_plus_seq_ = true;
|
|
}
|
|
|
|
flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
|
|
table_opt_.metadata_block_size, table_opt_.block_size_deviation,
|
|
// Note: this is sub-optimal since sub_index_builder_ could later reset
|
|
// seperator_is_key_plus_seq_ but the probability of that is low.
|
|
sub_index_builder_->seperator_is_key_plus_seq_
|
|
? sub_index_builder_->index_block_builder_
|
|
: sub_index_builder_->index_block_builder_without_seq_));
|
|
partition_cut_requested_ = false;
|
|
}
|
|
|
|
void PartitionedIndexBuilder::RequestPartitionCut() {
|
|
partition_cut_requested_ = true;
|
|
}
|
|
|
|
void PartitionedIndexBuilder::AddIndexEntry(
|
|
std::string* last_key_in_current_block,
|
|
const Slice* first_key_in_next_block, const BlockHandle& block_handle) {
|
|
// Note: to avoid two consecuitive flush in the same method call, we do not
|
|
// check flush policy when adding the last key
|
|
if (UNLIKELY(first_key_in_next_block == nullptr)) { // no more keys
|
|
if (sub_index_builder_ == nullptr) {
|
|
MakeNewSubIndexBuilder();
|
|
}
|
|
sub_index_builder_->AddIndexEntry(last_key_in_current_block,
|
|
first_key_in_next_block, block_handle);
|
|
if (!seperator_is_key_plus_seq_ &&
|
|
sub_index_builder_->seperator_is_key_plus_seq_) {
|
|
// then we need to apply it to all sub-index builders and reset
|
|
// flush_policy to point to Block Builder of sub_index_builder_ that store
|
|
// internal keys.
|
|
seperator_is_key_plus_seq_ = true;
|
|
flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
|
|
table_opt_.metadata_block_size, table_opt_.block_size_deviation,
|
|
sub_index_builder_->index_block_builder_));
|
|
}
|
|
sub_index_last_key_ = std::string(*last_key_in_current_block);
|
|
entries_.push_back(
|
|
{sub_index_last_key_,
|
|
std::unique_ptr<ShortenedIndexBuilder>(sub_index_builder_)});
|
|
sub_index_builder_ = nullptr;
|
|
cut_filter_block = true;
|
|
} else {
|
|
// apply flush policy only to non-empty sub_index_builder_
|
|
if (sub_index_builder_ != nullptr) {
|
|
std::string handle_encoding;
|
|
block_handle.EncodeTo(&handle_encoding);
|
|
bool do_flush =
|
|
partition_cut_requested_ ||
|
|
flush_policy_->Update(*last_key_in_current_block, handle_encoding);
|
|
if (do_flush) {
|
|
entries_.push_back(
|
|
{sub_index_last_key_,
|
|
std::unique_ptr<ShortenedIndexBuilder>(sub_index_builder_)});
|
|
cut_filter_block = true;
|
|
sub_index_builder_ = nullptr;
|
|
}
|
|
}
|
|
if (sub_index_builder_ == nullptr) {
|
|
MakeNewSubIndexBuilder();
|
|
}
|
|
sub_index_builder_->AddIndexEntry(last_key_in_current_block,
|
|
first_key_in_next_block, block_handle);
|
|
sub_index_last_key_ = std::string(*last_key_in_current_block);
|
|
if (!seperator_is_key_plus_seq_ &&
|
|
sub_index_builder_->seperator_is_key_plus_seq_) {
|
|
// then we need to apply it to all sub-index builders and reset
|
|
// flush_policy to point to Block Builder of sub_index_builder_ that store
|
|
// internal keys.
|
|
seperator_is_key_plus_seq_ = true;
|
|
flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
|
|
table_opt_.metadata_block_size, table_opt_.block_size_deviation,
|
|
sub_index_builder_->index_block_builder_));
|
|
}
|
|
}
|
|
}
|
|
|
|
Status PartitionedIndexBuilder::Finish(
|
|
IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) {
|
|
if (partition_cnt_ == 0) {
|
|
partition_cnt_ = entries_.size();
|
|
}
|
|
// It must be set to null after last key is added
|
|
assert(sub_index_builder_ == nullptr);
|
|
if (finishing_indexes == true) {
|
|
Entry& last_entry = entries_.front();
|
|
std::string handle_encoding;
|
|
last_partition_block_handle.EncodeTo(&handle_encoding);
|
|
std::string handle_delta_encoding;
|
|
PutVarsignedint64(
|
|
&handle_delta_encoding,
|
|
last_partition_block_handle.size() - last_encoded_handle_.size());
|
|
last_encoded_handle_ = last_partition_block_handle;
|
|
const Slice handle_delta_encoding_slice(handle_delta_encoding);
|
|
index_block_builder_.Add(last_entry.key, handle_encoding,
|
|
&handle_delta_encoding_slice);
|
|
if (!seperator_is_key_plus_seq_) {
|
|
index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key),
|
|
handle_encoding,
|
|
&handle_delta_encoding_slice);
|
|
}
|
|
entries_.pop_front();
|
|
}
|
|
// If there is no sub_index left, then return the 2nd level index.
|
|
if (UNLIKELY(entries_.empty())) {
|
|
if (seperator_is_key_plus_seq_) {
|
|
index_blocks->index_block_contents = index_block_builder_.Finish();
|
|
} else {
|
|
index_blocks->index_block_contents =
|
|
index_block_builder_without_seq_.Finish();
|
|
}
|
|
top_level_index_size_ = index_blocks->index_block_contents.size();
|
|
index_size_ += top_level_index_size_;
|
|
return Status::OK();
|
|
} else {
|
|
// Finish the next partition index in line and Incomplete() to indicate we
|
|
// expect more calls to Finish
|
|
Entry& entry = entries_.front();
|
|
// Apply the policy to all sub-indexes
|
|
entry.value->seperator_is_key_plus_seq_ = seperator_is_key_plus_seq_;
|
|
auto s = entry.value->Finish(index_blocks);
|
|
index_size_ += index_blocks->index_block_contents.size();
|
|
finishing_indexes = true;
|
|
return s.ok() ? Status::Incomplete() : s;
|
|
}
|
|
}
|
|
|
|
size_t PartitionedIndexBuilder::NumPartitions() const { return partition_cnt_; }
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
|