Exclude seq from index keys

Summary:
Index blocks have the same format as data blocks. The keys in index blocks are therefore, like the keys in data blocks, internal keys: in addition to the user key, each key carries 8 bytes that encode the sequence number and value type. These extra 8 bytes, however, are not necessary in index blocks, since an index key only acts as a separator between two data blocks. The only exception is when the last key of a block and the first key of the next block share the same user key, in which case the sequence number is required to act as a separator.
The patch excludes the sequence number from index keys only if the above special case does not happen for any of the index keys. It then records that in the property block. The reader looks at the property block to see if it should expect sequence numbers in the keys of the index block.
Closes https://github.com/facebook/rocksdb/pull/3894

Differential Revision: D8118775

Pulled By: maysamyabandeh

fbshipit-source-id: 915479f028b5799ca91671d67455ecdefbd873bd
main
Maysam Yabandeh 6 years ago committed by Facebook Github Bot
parent 8c3bf0801b
commit 402b7aa07f
  1. 3
      HISTORY.md
  2. 28
      db/db_properties_test.cc
  3. 7
      include/rocksdb/table.h
  4. 4
      include/rocksdb/table_properties.h
  5. 47
      table/block.cc
  6. 50
      table/block.h
  7. 2
      table/block_based_table_builder.cc
  8. 151
      table/block_based_table_reader.cc
  9. 15
      table/block_based_table_reader.h
  10. 19
      table/block_test.cc
  11. 27
      table/index_builder.cc
  12. 58
      table/index_builder.h
  13. 14
      table/meta_blocks.cc
  14. 6
      table/partitioned_filter_block.cc
  15. 10
      table/table_properties.cc
  16. 10
      table/table_test.cc
  17. 7
      util/testutil.h

@ -4,6 +4,9 @@
* For users of `Statistics` objects created via `CreateDBStatistics()`, the format of the string returned by its `ToString()` method has changed.
* With LRUCache, when high_pri_pool_ratio > 0, midpoint insertion strategy will be enabled to put low-pri items to the tail of low-pri list (the midpoint) when they are first inserted into the cache. This is to make cache entries that never get hit age out faster, improving cache efficiency when a large background scan is present.
### New Features
* Changes the format of index blocks by storing the keys in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatible but not forward compatible. It is disabled by default unless format_version 3 or above is used.
## 5.14.0 (5/16/2018)
### Public API Change
* Add a BlockBasedTableOption to align uncompressed data blocks on the smaller of block size or page size boundary, to reduce flash reads by avoiding reads spanning 4K pages.

@ -177,17 +177,16 @@ void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) {
std::replace(tp_string.begin(), tp_string.end(), ';', ' ');
std::replace(tp_string.begin(), tp_string.end(), '=', ' ');
ResetTableProperties(tp);
sscanf(tp_string.c_str(),
"# data blocks %" SCNu64 " # entries %" SCNu64 " raw key size %" SCNu64
" raw average key size %lf "
" raw value size %" SCNu64
" raw average value size %lf "
" data block size %" SCNu64 " index block size %" SCNu64
" filter block size %" SCNu64,
" data block size %" SCNu64 " index block size (user-key? %" SCNu64
") %" SCNu64 " filter block size %" SCNu64,
&tp->num_data_blocks, &tp->num_entries, &tp->raw_key_size,
&dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size,
&tp->index_size, &tp->filter_size);
&tp->index_key_is_user_key, &tp->index_size, &tp->filter_size);
}
void VerifySimilar(uint64_t a, uint64_t b, double bias) {
@ -224,7 +223,8 @@ void GetExpectedTableProperties(TableProperties* expected_tp,
const int kKeySize, const int kValueSize,
const int kKeysPerTable, const int kTableCount,
const int kBloomBitsPerKey,
const size_t kBlockSize) {
const size_t kBlockSize,
const bool index_key_is_user_key) {
const int kKeyCount = kTableCount * kKeysPerTable;
const int kAvgSuccessorSize = kKeySize / 5;
const int kEncodingSavePerKey = kKeySize / 4;
@ -238,7 +238,8 @@ void GetExpectedTableProperties(TableProperties* expected_tp,
expected_tp->data_size =
kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize));
expected_tp->index_size =
expected_tp->num_data_blocks * (kAvgSuccessorSize + 8);
expected_tp->num_data_blocks *
(kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8));
expected_tp->filter_size =
kTableCount * (kKeysPerTable * kBloomBitsPerKey / 8);
}
@ -315,14 +316,14 @@ TEST_F(DBPropertiesTest, AggregatedTableProperties) {
}
std::string property;
db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property);
TableProperties output_tp;
ParseTablePropertiesString(property, &output_tp);
bool index_key_is_user_key = output_tp.index_key_is_user_key > 0;
TableProperties expected_tp;
GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize,
kKeysPerTable, kTableCount, kBloomBitsPerKey,
table_options.block_size);
TableProperties output_tp;
ParseTablePropertiesString(property, &output_tp);
table_options.block_size, index_key_is_user_key);
VerifyTableProperties(expected_tp, output_tp);
}
@ -489,6 +490,7 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
}
db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string);
ParseTablePropertiesString(tp_string, &tp);
bool index_key_is_user_key = tp.index_key_is_user_key > 0;
ASSERT_EQ(sum_tp.data_size, tp.data_size);
ASSERT_EQ(sum_tp.index_size, tp.index_size);
ASSERT_EQ(sum_tp.filter_size, tp.filter_size);
@ -497,9 +499,9 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
ASSERT_EQ(sum_tp.num_data_blocks, tp.num_data_blocks);
ASSERT_EQ(sum_tp.num_entries, tp.num_entries);
if (table > 3) {
GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize,
kKeysPerTable, table, kBloomBitsPerKey,
table_options.block_size);
GetExpectedTableProperties(
&expected_tp, kKeySize, kValueSize, kKeysPerTable, table,
kBloomBitsPerKey, table_options.block_size, index_key_is_user_key);
// Gives larger bias here as index block size, filter block size,
// and data block size become much harder to estimate in this test.
VerifyTableProperties(tp, expected_tp, 0.5, 0.4, 0.4, 0.25);

@ -214,8 +214,11 @@ struct BlockBasedTableOptions {
// encode compressed blocks with LZ4, BZip2 and Zlib compression. If you
// don't plan to run RocksDB before version 3.10, you should probably use
// this.
// This option only affects newly written tables. When reading existing tables,
// the information about version is read from the footer.
// 3 -- Can be read by RocksDB's versions since 5.15. Changes the way we
// encode the keys in index blocks. If you don't plan to run RocksDB before
// version 5.15, you should probably use this.
// This option only affects newly written tables. When reading existing
// tables, the information about version is read from the footer.
uint32_t format_version = 2;
// Store index blocks on disk in compressed format. Changing this option to

@ -33,6 +33,7 @@ struct TablePropertiesNames {
static const std::string kIndexSize;
static const std::string kIndexPartitions;
static const std::string kTopLevelIndexSize;
static const std::string kIndexKeyIsUserKey;
static const std::string kFilterSize;
static const std::string kRawKeySize;
static const std::string kRawValueSize;
@ -134,6 +135,9 @@ struct TableProperties {
uint64_t index_partitions = 0;
// Size of the top-level index if kTwoLevelIndexSearch is used
uint64_t top_level_index_size = 0;
// Whether the index key is user key. Otherwise it includes 8 bytes of sequence
// number added by internal key format.
uint64_t index_key_is_user_key = 0;
// the size of filter block.
uint64_t filter_size = 0;
// total raw key size

@ -87,7 +87,11 @@ void BlockIter::Prev() {
const Slice current_key(key_ptr, current_prev_entry.key_size);
current_ = current_prev_entry.offset;
key_.SetInternalKey(current_key, false /* copy */);
if (key_includes_seq_) {
key_.SetInternalKey(current_key, false /* copy */);
} else {
key_.SetUserKey(current_key, false /* copy */);
}
value_ = current_prev_entry.value;
return;
@ -136,6 +140,10 @@ void BlockIter::Prev() {
}
void BlockIter::Seek(const Slice& target) {
Slice seek_key = target;
if (!key_includes_seq_) {
seek_key = ExtractUserKey(target);
}
PERF_TIMER_GUARD(block_seek_nanos);
if (data_ == nullptr) { // Not init yet
return;
@ -145,7 +153,7 @@ void BlockIter::Seek(const Slice& target) {
if (prefix_index_) {
ok = PrefixSeek(target, &index);
} else {
ok = BinarySeek(target, 0, num_restarts_ - 1, &index);
ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index);
}
if (!ok) {
@ -155,7 +163,7 @@ void BlockIter::Seek(const Slice& target) {
// Linear search (within restart block) for first key >= target
while (true) {
if (!ParseNextKey() || Compare(key_.GetInternalKey(), target) >= 0) {
if (!ParseNextKey() || Compare(key_, seek_key) >= 0) {
return;
}
}
@ -163,24 +171,28 @@ void BlockIter::Seek(const Slice& target) {
void BlockIter::SeekForPrev(const Slice& target) {
PERF_TIMER_GUARD(block_seek_nanos);
Slice seek_key = target;
if (!key_includes_seq_) {
seek_key = ExtractUserKey(target);
}
if (data_ == nullptr) { // Not init yet
return;
}
uint32_t index = 0;
bool ok = BinarySeek(target, 0, num_restarts_ - 1, &index);
bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index);
if (!ok) {
return;
}
SeekToRestartPoint(index);
// Linear search (within restart block) for first key >= target
// Linear search (within restart block) for first key >= seek_key
while (ParseNextKey() && Compare(key_.GetInternalKey(), target) < 0) {
while (ParseNextKey() && Compare(key_, seek_key) < 0) {
}
if (!Valid()) {
SeekToLast();
} else {
while (Valid() && Compare(key_.GetInternalKey(), target) > 0) {
while (Valid() && Compare(key_, seek_key) > 0) {
Prev();
}
}
@ -233,7 +245,11 @@ bool BlockIter::ParseNextKey() {
if (shared == 0) {
// If this key doesn't share any bytes with prev key then we don't need
// to decode it and can use its address in the block directly.
key_.SetInternalKey(Slice(p, non_shared), false /* copy */);
if (key_includes_seq_) {
key_.SetInternalKey(Slice(p, non_shared), false /* copy */);
} else {
key_.SetUserKey(Slice(p, non_shared), false /* copy */);
}
key_pinned_ = true;
} else {
// This key share `shared` bytes with prev key, we need to decode it
@ -380,6 +396,10 @@ bool BlockIter::BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
bool BlockIter::PrefixSeek(const Slice& target, uint32_t* index) {
assert(prefix_index_);
Slice seek_key = target;
if (!key_includes_seq_) {
seek_key = ExtractUserKey(target);
}
uint32_t* block_ids = nullptr;
uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);
@ -387,7 +407,7 @@ bool BlockIter::PrefixSeek(const Slice& target, uint32_t* index) {
current_ = restarts_;
return false;
} else {
return BinaryBlockIndexSeek(target, block_ids, 0, num_blocks - 1, index);
return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index);
}
}
@ -422,8 +442,9 @@ Block::Block(BlockContents&& contents, SequenceNumber _global_seqno,
}
}
BlockIter* Block::NewIterator(const Comparator* cmp, BlockIter* iter,
bool total_order_seek, Statistics* stats) {
BlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp,
BlockIter* iter, bool total_order_seek,
Statistics* stats, bool key_includes_seq) {
BlockIter* ret_iter;
if (iter != nullptr) {
ret_iter = iter;
@ -441,9 +462,9 @@ BlockIter* Block::NewIterator(const Comparator* cmp, BlockIter* iter,
} else {
BlockPrefixIndex* prefix_index_ptr =
total_order_seek ? nullptr : prefix_index_.get();
ret_iter->Initialize(cmp, data_, restart_offset_, num_restarts_,
ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_,
prefix_index_ptr, global_seqno_,
read_amp_bitmap_.get());
read_amp_bitmap_.get(), key_includes_seq);
if (read_amp_bitmap_) {
if (read_amp_bitmap_->GetStatistics() != stats) {

@ -162,6 +162,9 @@ class Block {
// the iterator will simply be set as "invalid", rather than returning
// the key that is just pass the target key.
//
// If comparator is InternalKeyComparator, user_comparator is its user
// comparator; they are equal otherwise.
//
// If iter is null, return new Iterator
// If iter is not null, update this one and return it as Iterator*
//
@ -169,9 +172,11 @@ class Block {
// This option only applies for index block. For data block, hash_index_
// and prefix_index_ are null, so this option does not matter.
BlockIter* NewIterator(const Comparator* comparator,
const Comparator* user_comparator,
BlockIter* iter = nullptr,
bool total_order_seek = true,
Statistics* stats = nullptr);
Statistics* stats = nullptr,
bool key_includes_seq = true);
void SetBlockPrefixIndex(BlockPrefixIndex* prefix_index);
// Report an approximation of how much memory has been used.
@ -203,6 +208,7 @@ class BlockIter final : public InternalIterator {
// and status() is OK.
BlockIter()
: comparator_(nullptr),
user_comparator_(nullptr),
data_(nullptr),
restarts_(0),
num_restarts_(0),
@ -211,26 +217,30 @@ class BlockIter final : public InternalIterator {
status_(Status::OK()),
prefix_index_(nullptr),
key_pinned_(false),
key_includes_seq_(true),
global_seqno_(kDisableGlobalSequenceNumber),
read_amp_bitmap_(nullptr),
last_bitmap_offset_(0) {}
BlockIter(const Comparator* comparator, const char* data, uint32_t restarts,
uint32_t num_restarts, BlockPrefixIndex* prefix_index,
SequenceNumber global_seqno, BlockReadAmpBitmap* read_amp_bitmap)
BlockIter(const Comparator* comparator, const Comparator* user_comparator,
const char* data, uint32_t restarts, uint32_t num_restarts,
BlockPrefixIndex* prefix_index, SequenceNumber global_seqno,
BlockReadAmpBitmap* read_amp_bitmap, bool key_includes_seq)
: BlockIter() {
Initialize(comparator, data, restarts, num_restarts, prefix_index,
global_seqno, read_amp_bitmap);
Initialize(comparator, user_comparator, data, restarts, num_restarts,
prefix_index, global_seqno, read_amp_bitmap, key_includes_seq);
}
void Initialize(const Comparator* comparator, const char* data,
void Initialize(const Comparator* comparator,
const Comparator* user_comparator, const char* data,
uint32_t restarts, uint32_t num_restarts,
BlockPrefixIndex* prefix_index, SequenceNumber global_seqno,
BlockReadAmpBitmap* read_amp_bitmap) {
BlockReadAmpBitmap* read_amp_bitmap, bool key_includes_seq) {
assert(data_ == nullptr); // Ensure it is called only once
assert(num_restarts > 0); // Ensure the param is valid
comparator_ = comparator;
user_comparator_ = user_comparator;
data_ = data;
restarts_ = restarts;
num_restarts_ = num_restarts;
@ -240,6 +250,7 @@ class BlockIter final : public InternalIterator {
global_seqno_ = global_seqno;
read_amp_bitmap_ = read_amp_bitmap;
last_bitmap_offset_ = current_ + 1;
key_includes_seq_ = key_includes_seq;
}
// Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do
@ -263,7 +274,7 @@ class BlockIter final : public InternalIterator {
virtual Status status() const override { return status_; }
virtual Slice key() const override {
assert(Valid());
return key_.GetInternalKey();
return key_includes_seq_ ? key_.GetInternalKey() : key_.GetUserKey();
}
virtual Slice value() const override {
assert(Valid());
@ -312,7 +323,11 @@ class BlockIter final : public InternalIterator {
}
private:
// Note: The type could be changed to InternalKeyComparator but we see a weird
// performance drop by that.
const Comparator* comparator_;
// Same as comparator_ if comparator_ is not InternalKeyComparator
const Comparator* user_comparator_;
const char* data_; // underlying block contents
uint32_t restarts_; // Offset of restart array (list of fixed32)
uint32_t num_restarts_; // Number of uint32_t entries in restart array
@ -325,8 +340,11 @@ class BlockIter final : public InternalIterator {
Status status_;
BlockPrefixIndex* prefix_index_;
bool key_pinned_;
// Key is in InternalKey format
bool key_includes_seq_;
SequenceNumber global_seqno_;
public:
// read-amp bitmap
BlockReadAmpBitmap* read_amp_bitmap_;
// last `current_` value we report to read-amp bitmap
@ -357,7 +375,19 @@ class BlockIter final : public InternalIterator {
int32_t prev_entries_idx_ = -1;
inline int Compare(const Slice& a, const Slice& b) const {
return comparator_->Compare(a, b);
if (key_includes_seq_) {
return comparator_->Compare(a, b);
} else {
return user_comparator_->Compare(a, b);
}
}
// Compares the key held in `ikey` against the raw key slice `b`.
// When the block stores keys that include the sequence number
// (key_includes_seq_), the internal-key view of `ikey` is compared using
// comparator_; otherwise only the user-key view is compared, using
// user_comparator_.
inline int Compare(const IterKey& ikey, const Slice& b) const {
  return key_includes_seq_
             ? comparator_->Compare(ikey.GetInternalKey(), b)
             : user_comparator_->Compare(ikey.GetUserKey(), b);
}
// Return the offset in data_ just past the end of the current entry.

@ -763,6 +763,8 @@ Status BlockBasedTableBuilder::Finish() {
r->props.top_level_index_size =
r->p_index_builder_->EstimateTopLevelIndexSize(r->offset);
}
r->props.index_key_is_user_key =
!r->index_builder->seperator_is_key_plus_seq();
r->props.creation_time = r->creation_time;
r->props.oldest_key_time = r->oldest_key_time;

@ -212,7 +212,7 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
const InternalKeyComparator* icomparator,
IndexReader** index_reader,
const PersistentCacheOptions& cache_options,
const int level) {
const int level, const bool index_key_includes_seq) {
std::unique_ptr<Block> index_block;
auto s = ReadBlockFromFile(
file, prefetch_buffer, footer, ReadOptions(), index_handle,
@ -221,9 +221,9 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */);
if (s.ok()) {
*index_reader =
new PartitionIndexReader(table, icomparator, std::move(index_block),
ioptions.statistics, level);
*index_reader = new PartitionIndexReader(
table, icomparator, std::move(index_block), ioptions.statistics,
level, index_key_includes_seq);
}
return s;
@ -237,15 +237,19 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
if (!partition_map_.empty()) {
return NewTwoLevelIterator(
new BlockBasedTable::PartitionedIndexIteratorState(
table_, partition_map_.size() ? &partition_map_ : nullptr),
index_block_->NewIterator(icomparator_, nullptr, true));
table_, &partition_map_, index_key_includes_seq_),
index_block_->NewIterator(
icomparator_, icomparator_->user_comparator(), nullptr, true));
} else {
auto ro = ReadOptions();
ro.fill_cache = fill_cache;
bool kIsIndex = true;
return new BlockBasedTableIterator(
table_, ro, *icomparator_,
index_block_->NewIterator(icomparator_, nullptr, true), false,
/* prefix_extractor */ nullptr);
index_block_->NewIterator(
icomparator_, icomparator_->user_comparator(), nullptr, true),
false,
/* prefix_extractor */ nullptr, kIsIndex, index_key_includes_seq_);
}
// TODO(myabandeh): Update TwoLevelIterator to be able to make use of
// on-stack BlockIter while the state is on heap. Currently it assumes
@ -258,7 +262,8 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
auto rep = table_->rep_;
BlockIter biter;
BlockHandle handle;
index_block_->NewIterator(icomparator_, &biter, true);
index_block_->NewIterator(icomparator_, icomparator_->user_comparator(),
&biter, true);
// Index partitions are assumed to be consecutive. Prefetch them all.
// Read the first block offset
biter.SeekToFirst();
@ -347,16 +352,18 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
PartitionIndexReader(BlockBasedTable* table,
const InternalKeyComparator* icomparator,
std::unique_ptr<Block>&& index_block, Statistics* stats,
const int /*level*/)
const int /*level*/, const bool index_key_includes_seq)
: IndexReader(icomparator, stats),
table_(table),
index_block_(std::move(index_block)) {
index_block_(std::move(index_block)),
index_key_includes_seq_(index_key_includes_seq) {
assert(index_block_ != nullptr);
}
BlockBasedTable* table_;
std::unique_ptr<Block> index_block_;
std::unordered_map<uint64_t, BlockBasedTable::CachableEntry<Block>>
partition_map_;
const bool index_key_includes_seq_;
};
// Index that allows binary search lookup for the first key of each block.
@ -374,7 +381,8 @@ class BinarySearchIndexReader : public IndexReader {
const ImmutableCFOptions& ioptions,
const InternalKeyComparator* icomparator,
IndexReader** index_reader,
const PersistentCacheOptions& cache_options) {
const PersistentCacheOptions& cache_options,
const bool index_key_includes_seq) {
std::unique_ptr<Block> index_block;
auto s = ReadBlockFromFile(
file, prefetch_buffer, footer, ReadOptions(), index_handle,
@ -384,7 +392,8 @@ class BinarySearchIndexReader : public IndexReader {
if (s.ok()) {
*index_reader = new BinarySearchIndexReader(
icomparator, std::move(index_block), ioptions.statistics);
icomparator, std::move(index_block), ioptions.statistics,
index_key_includes_seq);
}
return s;
@ -393,7 +402,9 @@ class BinarySearchIndexReader : public IndexReader {
virtual InternalIterator* NewIterator(BlockIter* iter = nullptr,
bool /*dont_care*/ = true,
bool /*dont_care*/ = true) override {
return index_block_->NewIterator(icomparator_, iter, true);
return index_block_->NewIterator(icomparator_,
icomparator_->user_comparator(), iter,
true, nullptr, index_key_includes_seq_);
}
virtual size_t size() const override { return index_block_->size(); }
@ -409,11 +420,14 @@ class BinarySearchIndexReader : public IndexReader {
private:
BinarySearchIndexReader(const InternalKeyComparator* icomparator,
std::unique_ptr<Block>&& index_block,
Statistics* stats)
: IndexReader(icomparator, stats), index_block_(std::move(index_block)) {
Statistics* stats, const bool index_key_includes_seq)
: IndexReader(icomparator, stats),
index_block_(std::move(index_block)),
index_key_includes_seq_(index_key_includes_seq) {
assert(index_block_ != nullptr);
}
std::unique_ptr<Block> index_block_;
const bool index_key_includes_seq_;
};
// Index that leverages an internal hash table to quicken the lookup for a given
@ -429,7 +443,8 @@ class HashIndexReader : public IndexReader {
InternalIterator* meta_index_iter,
IndexReader** index_reader,
bool /*hash_index_allow_collision*/,
const PersistentCacheOptions& cache_options) {
const PersistentCacheOptions& cache_options,
const bool index_key_includes_seq) {
std::unique_ptr<Block> index_block;
auto s = ReadBlockFromFile(
file, prefetch_buffer, footer, ReadOptions(), index_handle,
@ -447,7 +462,7 @@ class HashIndexReader : public IndexReader {
auto new_index_reader =
new HashIndexReader(icomparator, std::move(index_block),
ioptions.statistics);
ioptions.statistics, index_key_includes_seq);
*index_reader = new_index_reader;
// Get prefixes block
@ -504,7 +519,9 @@ class HashIndexReader : public IndexReader {
virtual InternalIterator* NewIterator(BlockIter* iter = nullptr,
bool total_order_seek = true,
bool /*dont_care*/ = true) override {
return index_block_->NewIterator(icomparator_, iter, total_order_seek);
return index_block_->NewIterator(
icomparator_, icomparator_->user_comparator(), iter, total_order_seek,
nullptr, index_key_includes_seq_);
}
virtual size_t size() const override { return index_block_->size(); }
@ -520,8 +537,11 @@ class HashIndexReader : public IndexReader {
private:
HashIndexReader(const InternalKeyComparator* icomparator,
std::unique_ptr<Block>&& index_block, Statistics* stats)
: IndexReader(icomparator, stats), index_block_(std::move(index_block)) {
std::unique_ptr<Block>&& index_block, Statistics* stats,
const bool index_key_includes_seq)
: IndexReader(icomparator, stats),
index_block_(std::move(index_block)),
index_key_includes_seq_(index_key_includes_seq) {
assert(index_block_ != nullptr);
}
@ -530,6 +550,7 @@ class HashIndexReader : public IndexReader {
std::unique_ptr<Block> index_block_;
BlockContents prefixes_contents_;
const bool index_key_includes_seq_;
};
// Helper function to setup the cache key's prefix for the Table.
@ -1026,7 +1047,8 @@ Status BlockBasedTable::ReadMetaBlock(Rep* rep,
*meta_block = std::move(meta);
// meta block uses bytewise comparator.
iter->reset(meta_block->get()->NewIterator(BytewiseComparator()));
iter->reset(meta_block->get()->NewIterator(BytewiseComparator(),
BytewiseComparator()));
return Status::OK();
}
@ -1502,14 +1524,15 @@ InternalIterator* BlockBasedTable::NewIndexIterator(
BlockIter* BlockBasedTable::NewDataBlockIterator(
Rep* rep, const ReadOptions& ro, const Slice& index_value,
BlockIter* input_iter, bool is_index, GetContext* get_context) {
BlockIter* input_iter, bool is_index, bool key_includes_seq,
GetContext* get_context) {
BlockHandle handle;
Slice input = index_value;
// We intentionally allow extra stuff in index_value so that we
// can add more features in the future.
Status s = handle.DecodeFrom(&input);
return NewDataBlockIterator(rep, ro, handle, input_iter, is_index,
get_context, s);
key_includes_seq, get_context, s);
}
// Convert an index iterator value (i.e., an encoded BlockHandle)
@ -1518,7 +1541,8 @@ BlockIter* BlockBasedTable::NewDataBlockIterator(
// If input_iter is not null, update this iter and return it
BlockIter* BlockBasedTable::NewDataBlockIterator(
Rep* rep, const ReadOptions& ro, const BlockHandle& handle,
BlockIter* input_iter, bool is_index, GetContext* get_context, Status s) {
BlockIter* input_iter, bool is_index, bool key_includes_seq,
GetContext* get_context, Status s) {
PERF_TIMER_GUARD(new_table_block_iter_nanos);
const bool no_io = (ro.read_tier == kBlockCacheTier);
@ -1564,8 +1588,9 @@ BlockIter* BlockBasedTable::NewDataBlockIterator(
if (s.ok()) {
assert(block.value != nullptr);
iter = block.value->NewIterator(&rep->internal_comparator, iter, true,
rep->ioptions.statistics);
iter = block.value->NewIterator(
&rep->internal_comparator, rep->internal_comparator.user_comparator(),
iter, true, rep->ioptions.statistics, key_includes_seq);
if (block.cache_handle != nullptr) {
iter->RegisterCleanup(&ReleaseCachedEntry, block_cache,
block.cache_handle);
@ -1677,8 +1702,11 @@ Status BlockBasedTable::MaybeLoadDataBlockToCache(
BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState(
BlockBasedTable* table,
std::unordered_map<uint64_t, CachableEntry<Block>>* block_map)
: table_(table), block_map_(block_map) {}
std::unordered_map<uint64_t, CachableEntry<Block>>* block_map,
bool index_key_includes_seq)
: table_(table),
block_map_(block_map),
index_key_includes_seq_(index_key_includes_seq) {}
const size_t BlockBasedTableIterator::kMaxReadaheadSize = 256 * 1024;
@ -1701,8 +1729,9 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
assert(block_cache);
RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_READ,
block_cache->GetUsage(block->second.cache_handle));
return block->second.value->NewIterator(&rep->internal_comparator, nullptr,
true, rep->ioptions.statistics);
return block->second.value->NewIterator(
&rep->internal_comparator, rep->internal_comparator.user_comparator(),
nullptr, true, rep->ioptions.statistics, index_key_includes_seq_);
}
// Create an empty iterator
return new BlockIter();
@ -1770,7 +1799,9 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key,
// and we're not really sure that we're past the end
// of the file
may_match = iiter->status().IsIncomplete();
} else if (ExtractUserKey(iiter->key())
} else if ((rep_->table_properties->index_key_is_user_key
? iiter->key()
: ExtractUserKey(iiter->key()))
.starts_with(ExtractUserKey(internal_prefix))) {
// we need to check for this subtle case because our only
// guarantee is that "the key is a string >= last key in that data
@ -1836,7 +1867,11 @@ void BlockBasedTableIterator::Seek(const Slice& target) {
FindKeyForward();
assert(!data_block_iter_.Valid() ||
icomp_.Compare(target, data_block_iter_.key()) <= 0);
(key_includes_seq_ &&
icomp_.Compare(target, data_block_iter_.key()) <= 0) ||
(!key_includes_seq_ &&
icomp_.user_comparator()->Compare(ExtractUserKey(target),
data_block_iter_.key()) <= 0));
}
void BlockBasedTableIterator::SeekForPrev(const Slice& target) {
@ -1952,7 +1987,8 @@ void BlockBasedTableIterator::InitDataBlock() {
}
BlockBasedTable::NewDataBlockIterator(rep, read_options_, data_block_handle,
&data_block_iter_, false,
&data_block_iter_, is_index_,
key_includes_seq_,
/* get_context */ nullptr, s);
block_iter_points_to_real_block_ = true;
}
@ -2024,24 +2060,25 @@ InternalIterator* BlockBasedTable::NewIterator(
Arena* arena, bool skip_filters) {
bool prefix_extractor_changed =
PrefixExtractorChanged(rep_->table_properties, prefix_extractor);
const bool kIsNotIndex = false;
if (arena == nullptr) {
return new BlockBasedTableIterator(
this, read_options, rep_->internal_comparator,
NewIndexIterator(
read_options,
prefix_extractor_changed &&
rep_->index_type == BlockBasedTableOptions::kHashSearch),
rep_->index_type == BlockBasedTableOptions::kHashSearch),
!skip_filters && !read_options.total_order_seek &&
prefix_extractor != nullptr && !prefix_extractor_changed,
prefix_extractor);
prefix_extractor != nullptr && !prefix_extractor_changed,
prefix_extractor, kIsNotIndex);
} else {
auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator));
return new (mem) BlockBasedTableIterator(
this, read_options, rep_->internal_comparator,
NewIndexIterator(read_options, prefix_extractor_changed),
!skip_filters && !read_options.total_order_seek &&
prefix_extractor != nullptr && !prefix_extractor_changed,
prefix_extractor);
prefix_extractor != nullptr && !prefix_extractor_changed,
prefix_extractor, kIsNotIndex);
}
}
@ -2061,7 +2098,8 @@ InternalIterator* BlockBasedTable::NewRangeTombstoneIterator(
assert(block_cache != nullptr);
if (block_cache->Ref(rep_->range_del_entry.cache_handle)) {
auto iter = rep_->range_del_entry.value->NewIterator(
&rep_->internal_comparator, nullptr /* iter */,
&rep_->internal_comparator,
rep_->internal_comparator.user_comparator(), nullptr /* iter */,
true /* total_order_seek */, rep_->ioptions.statistics);
iter->RegisterCleanup(&ReleaseCachedEntry, block_cache,
rep_->range_del_entry.cache_handle);
@ -2107,6 +2145,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
GetContext* get_context,
const SliceTransform* prefix_extractor,
bool skip_filters) {
assert(key.size() >= 8); // key must be internal key
Status s;
const bool no_io = read_options.read_tier == kBlockCacheTier;
CachableEntry<FilterBlockReader> filter_entry;
@ -2215,6 +2254,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
Status BlockBasedTable::Prefetch(const Slice* const begin,
const Slice* const end) {
auto& comparator = rep_->internal_comparator;
auto user_comparator = comparator.user_comparator();
// pre-condition
if (begin && end && comparator.Compare(*begin, *end) > 0) {
return Status::InvalidArgument(*begin, *end);
@ -2238,8 +2278,11 @@ Status BlockBasedTable::Prefetch(const Slice* const begin,
for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid();
iiter->Next()) {
Slice block_handle = iiter->value();
if (end && comparator.Compare(iiter->key(), *end) >= 0) {
const bool is_user_key = rep_->table_properties->index_key_is_user_key > 0;
if (end &&
((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) ||
(is_user_key &&
user_comparator->Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) {
if (prefetching_boundary_page) {
break;
}
@ -2392,12 +2435,14 @@ Status BlockBasedTable::CreateIndexReader(
return PartitionIndexReader::Create(
this, file, prefetch_buffer, footer, footer.index_handle(),
rep_->ioptions, icomparator, index_reader,
rep_->persistent_cache_options, level);
rep_->persistent_cache_options, level,
rep_->table_properties->index_key_is_user_key == 0);
}
case BlockBasedTableOptions::kBinarySearch: {
return BinarySearchIndexReader::Create(
file, prefetch_buffer, footer, footer.index_handle(), rep_->ioptions,
icomparator, index_reader, rep_->persistent_cache_options);
icomparator, index_reader, rep_->persistent_cache_options,
rep_->table_properties->index_key_is_user_key == 0);
}
case BlockBasedTableOptions::kHashSearch: {
std::unique_ptr<Block> meta_guard;
@ -2415,7 +2460,8 @@ Status BlockBasedTable::CreateIndexReader(
return BinarySearchIndexReader::Create(
file, prefetch_buffer, footer, footer.index_handle(),
rep_->ioptions, icomparator, index_reader,
rep_->persistent_cache_options);
rep_->persistent_cache_options,
rep_->table_properties->index_key_is_user_key == 0);
}
meta_index_iter = meta_iter_guard.get();
}
@ -2424,7 +2470,8 @@ Status BlockBasedTable::CreateIndexReader(
rep_->internal_prefix_transform.get(), footer, file, prefetch_buffer,
rep_->ioptions, icomparator, footer.index_handle(), meta_index_iter,
index_reader, rep_->hash_index_allow_collision,
rep_->persistent_cache_options);
rep_->persistent_cache_options,
rep_->table_properties->index_key_is_user_key == 0);
}
default: {
std::string error_message =
@ -2709,16 +2756,22 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
break;
}
Slice key = blockhandles_iter->key();
Slice user_key;
InternalKey ikey;
ikey.DecodeFrom(key);
if (rep_->table_properties->index_key_is_user_key == 0) {
ikey.DecodeFrom(key);
user_key = ikey.user_key();
} else {
user_key = key;
}
out_file->Append(" HEX ");
out_file->Append(ikey.user_key().ToString(true).c_str());
out_file->Append(user_key.ToString(true).c_str());
out_file->Append(": ");
out_file->Append(blockhandles_iter->value().ToString(true).c_str());
out_file->Append("\n");
std::string str_key = ikey.user_key().ToString();
std::string str_key = user_key.ToString();
std::string res_key("");
char cspace = ' ';
for (size_t i = 0; i < str_key.size(); i++) {

@ -217,11 +217,13 @@ class BlockBasedTable : public TableReader {
const Slice& index_value,
BlockIter* input_iter = nullptr,
bool is_index = false,
bool key_includes_seq = true,
GetContext* get_context = nullptr);
static BlockIter* NewDataBlockIterator(Rep* rep, const ReadOptions& ro,
const BlockHandle& block_hanlde,
BlockIter* input_iter = nullptr,
bool is_index = false,
bool key_includes_seq = true,
GetContext* get_context = nullptr,
Status s = Status());
@ -378,13 +380,15 @@ class BlockBasedTable::PartitionedIndexIteratorState
public:
PartitionedIndexIteratorState(
BlockBasedTable* table,
std::unordered_map<uint64_t, CachableEntry<Block>>* block_map = nullptr);
std::unordered_map<uint64_t, CachableEntry<Block>>* block_map,
const bool index_key_includes_seq);
InternalIterator* NewSecondaryIterator(const Slice& index_value) override;
private:
// Don't own table_
BlockBasedTable* table_;
std::unordered_map<uint64_t, CachableEntry<Block>>* block_map_;
bool index_key_includes_seq_;
};
// CachableEntry represents the entries that *may* be fetched from block cache.
@ -509,7 +513,8 @@ class BlockBasedTableIterator : public InternalIterator {
const ReadOptions& read_options,
const InternalKeyComparator& icomp,
InternalIterator* index_iter, bool check_filter,
const SliceTransform* prefix_extractor)
const SliceTransform* prefix_extractor, bool is_index,
bool key_includes_seq = true)
: table_(table),
read_options_(read_options),
icomp_(icomp),
@ -517,6 +522,8 @@ class BlockBasedTableIterator : public InternalIterator {
pinned_iters_mgr_(nullptr),
block_iter_points_to_real_block_(false),
check_filter_(check_filter),
is_index_(is_index),
key_includes_seq_(key_includes_seq),
prefix_extractor_(prefix_extractor) {}
~BlockBasedTableIterator() { delete index_iter_; }
@ -609,6 +616,10 @@ class BlockBasedTableIterator : public InternalIterator {
bool block_iter_points_to_real_block_;
bool is_out_of_bound_ = false;
bool check_filter_;
// If the blocks over which we iterate are index blocks
bool is_index_;
// Whether the keys in the blocks over which we iterate include the 8-byte
// sequence number (and value type) suffix
bool key_includes_seq_;
// TODO use block offset instead
std::string prev_index_value_;
const SliceTransform* prefix_extractor_;

@ -99,7 +99,8 @@ TEST_F(BlockTest, SimpleTest) {
// read contents of block sequentially
int count = 0;
InternalIterator *iter = reader.NewIterator(options.comparator);
InternalIterator *iter =
reader.NewIterator(options.comparator, options.comparator);
for (iter->SeekToFirst();iter->Valid(); count++, iter->Next()) {
// read kv from block
@ -113,7 +114,7 @@ TEST_F(BlockTest, SimpleTest) {
delete iter;
// read block contents randomly
iter = reader.NewIterator(options.comparator);
iter = reader.NewIterator(options.comparator, options.comparator);
for (int i = 0; i < num_records; i++) {
// find a random key in the lookaside array
@ -163,7 +164,7 @@ void CheckBlockContents(BlockContents contents, const int max_key,
NewFixedPrefixTransform(prefix_size));
std::unique_ptr<InternalIterator> regular_iter(
reader2.NewIterator(BytewiseComparator()));
reader2.NewIterator(BytewiseComparator(), BytewiseComparator()));
// Seek existent keys
for (size_t i = 0; i < keys.size(); i++) {
@ -388,8 +389,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) {
// read contents of block sequentially
size_t read_bytes = 0;
BlockIter *iter = static_cast<BlockIter *>(
reader.NewIterator(options.comparator, nullptr, true, stats.get()));
BlockIter *iter = static_cast<BlockIter *>(reader.NewIterator(
options.comparator, options.comparator, nullptr, true, stats.get()));
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
iter->value();
read_bytes += iter->TEST_CurrentEntrySize();
@ -421,8 +422,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) {
kBytesPerBit, stats.get());
size_t read_bytes = 0;
BlockIter *iter = static_cast<BlockIter *>(
reader.NewIterator(options.comparator, nullptr, true, stats.get()));
BlockIter *iter = static_cast<BlockIter *>(reader.NewIterator(
options.comparator, options.comparator, nullptr, true, stats.get()));
for (int i = 0; i < num_records; i++) {
Slice k(keys[i]);
@ -457,8 +458,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) {
kBytesPerBit, stats.get());
size_t read_bytes = 0;
BlockIter *iter = static_cast<BlockIter *>(
reader.NewIterator(options.comparator, nullptr, true, stats.get()));
BlockIter *iter = static_cast<BlockIter *>(reader.NewIterator(
options.comparator, options.comparator, nullptr, true, stats.get()));
std::unordered_set<int> read_keys;
for (int i = 0; i < num_records; i++) {
int index = rnd.Uniform(num_records);

@ -31,13 +31,15 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder(
IndexBuilder* result = nullptr;
switch (index_type) {
case BlockBasedTableOptions::kBinarySearch: {
result = new ShortenedIndexBuilder(comparator,
table_opt.index_block_restart_interval);
result = new ShortenedIndexBuilder(comparator,
table_opt.index_block_restart_interval,
table_opt.format_version);
}
break;
case BlockBasedTableOptions::kHashSearch: {
result = new HashIndexBuilder(comparator, int_key_slice_transform,
table_opt.index_block_restart_interval);
table_opt.index_block_restart_interval,
table_opt.format_version);
}
break;
case BlockBasedTableOptions::kTwoLevelIndexSearch: {
@ -62,9 +64,11 @@ PartitionedIndexBuilder::PartitionedIndexBuilder(
const InternalKeyComparator* comparator,
const BlockBasedTableOptions& table_opt)
: IndexBuilder(comparator),
index_block_builder_(table_opt.index_block_restart_interval),
index_block_builder_(table_opt.index_block_restart_interval,
table_opt.format_version),
sub_index_builder_(nullptr),
table_opt_(table_opt) {}
table_opt_(table_opt),
seperator_is_key_plus_seq_(false) {}
PartitionedIndexBuilder::~PartitionedIndexBuilder() {
delete sub_index_builder_;
@ -73,7 +77,8 @@ PartitionedIndexBuilder::~PartitionedIndexBuilder() {
void PartitionedIndexBuilder::MakeNewSubIndexBuilder() {
assert(sub_index_builder_ == nullptr);
sub_index_builder_ = new ShortenedIndexBuilder(
comparator_, table_opt_.index_block_restart_interval);
comparator_, table_opt_.index_block_restart_interval,
table_opt_.format_version);
flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
table_opt_.metadata_block_size, table_opt_.block_size_deviation,
sub_index_builder_->index_block_builder_));
@ -95,6 +100,10 @@ void PartitionedIndexBuilder::AddIndexEntry(
}
sub_index_builder_->AddIndexEntry(last_key_in_current_block,
first_key_in_next_block, block_handle);
if (sub_index_builder_->seperator_is_key_plus_seq_) {
// then we need to apply it to all sub-index builders
seperator_is_key_plus_seq_ = true;
}
sub_index_last_key_ = std::string(*last_key_in_current_block);
entries_.push_back(
{sub_index_last_key_,
@ -123,6 +132,10 @@ void PartitionedIndexBuilder::AddIndexEntry(
sub_index_builder_->AddIndexEntry(last_key_in_current_block,
first_key_in_next_block, block_handle);
sub_index_last_key_ = std::string(*last_key_in_current_block);
if (sub_index_builder_->seperator_is_key_plus_seq_) {
// then we need to apply it to all sub-index builders
seperator_is_key_plus_seq_ = true;
}
}
}
@ -146,6 +159,8 @@ Status PartitionedIndexBuilder::Finish(
// Finish the next partition index in line and Incomplete() to indicate we
// expect more calls to Finish
Entry& entry = entries_.front();
// Apply the policy to all sub-indexes
entry.value->seperator_is_key_plus_seq_ = seperator_is_key_plus_seq_;
auto s = entry.value->Finish(index_blocks);
finishing_indexes = true;
return s.ok() ? Status::Incomplete() : s;

@ -99,6 +99,8 @@ class IndexBuilder {
// Get the estimated size for index block.
virtual size_t EstimatedSize() const = 0;
// Reports whether the separator keys this builder writes to the index block
// carry the 8-byte sequence number/value type suffix in addition to the user
// key. The base implementation always answers true.
virtual bool seperator_is_key_plus_seq() { return true; }
protected:
const InternalKeyComparator* comparator_;
};
@ -115,9 +117,14 @@ class IndexBuilder {
class ShortenedIndexBuilder : public IndexBuilder {
public:
explicit ShortenedIndexBuilder(const InternalKeyComparator* comparator,
int index_block_restart_interval)
int index_block_restart_interval,
uint32_t format_version)
: IndexBuilder(comparator),
index_block_builder_(index_block_restart_interval) {}
index_block_builder_(index_block_restart_interval),
index_block_builder_without_seq_(index_block_restart_interval) {
// Defaulting to true for format_version <= 2 keeps the sequence number in
// index keys, i.e. disables the user-key-only index format for old versions
seperator_is_key_plus_seq_ = (format_version <= 2);
}
virtual void AddIndexEntry(std::string* last_key_in_current_block,
const Slice* first_key_in_next_block,
@ -125,31 +132,57 @@ class ShortenedIndexBuilder : public IndexBuilder {
if (first_key_in_next_block != nullptr) {
comparator_->FindShortestSeparator(last_key_in_current_block,
*first_key_in_next_block);
if (!seperator_is_key_plus_seq_ &&
comparator_->user_comparator()->Compare(
ExtractUserKey(*last_key_in_current_block),
ExtractUserKey(*first_key_in_next_block)) == 0) {
seperator_is_key_plus_seq_ = true;
}
} else {
comparator_->FindShortSuccessor(last_key_in_current_block);
}
auto sep = Slice(*last_key_in_current_block);
std::string handle_encoding;
block_handle.EncodeTo(&handle_encoding);
index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
index_block_builder_.Add(sep, handle_encoding);
if (!seperator_is_key_plus_seq_) {
index_block_builder_without_seq_.Add(ExtractUserKey(sep),
handle_encoding);
}
}
using IndexBuilder::Finish;
virtual Status Finish(
IndexBlocks* index_blocks,
const BlockHandle& /*last_partition_block_handle*/) override {
index_blocks->index_block_contents = index_block_builder_.Finish();
if (seperator_is_key_plus_seq_) {
index_blocks->index_block_contents = index_block_builder_.Finish();
} else {
index_blocks->index_block_contents =
index_block_builder_without_seq_.Finish();
}
return Status::OK();
}
virtual size_t EstimatedSize() const override {
return index_block_builder_.CurrentSizeEstimate();
if (seperator_is_key_plus_seq_) {
return index_block_builder_.CurrentSizeEstimate();
} else {
return index_block_builder_without_seq_.CurrentSizeEstimate();
}
}
// Returns the current value of seperator_is_key_plus_seq_. When it is true,
// Finish() emits index_block_builder_ (keys with sequence number); otherwise
// it emits index_block_builder_without_seq_ (user keys only).
virtual bool seperator_is_key_plus_seq() override {
return seperator_is_key_plus_seq_;
}
friend class PartitionedIndexBuilder;
private:
BlockBuilder index_block_builder_;
BlockBuilder index_block_builder_without_seq_;
bool seperator_is_key_plus_seq_;
};
// HashIndexBuilder contains a binary-searchable primary index and the
@ -183,9 +216,11 @@ class HashIndexBuilder : public IndexBuilder {
public:
explicit HashIndexBuilder(const InternalKeyComparator* comparator,
const SliceTransform* hash_key_extractor,
int index_block_restart_interval)
int index_block_restart_interval,
int format_version)
: IndexBuilder(comparator),
primary_index_builder_(comparator, index_block_restart_interval),
primary_index_builder_(comparator, index_block_restart_interval,
format_version),
hash_key_extractor_(hash_key_extractor) {}
virtual void AddIndexEntry(std::string* last_key_in_current_block,
@ -240,6 +275,10 @@ class HashIndexBuilder : public IndexBuilder {
prefix_meta_block_.size();
}
// Forwards the question to primary_index_builder_ and returns its answer.
virtual bool seperator_is_key_plus_seq() override {
return primary_index_builder_.seperator_is_key_plus_seq();
}
private:
void FlushPendingPrefix() {
prefix_block_.append(pending_entry_prefix_.data(),
@ -316,6 +355,10 @@ class PartitionedIndexBuilder : public IndexBuilder {
// cutting the next partition
void RequestPartitionCut();
// Returns the builder-wide seperator_is_key_plus_seq_ flag (as opposed to the
// flag of any individual sub-index builder).
virtual bool seperator_is_key_plus_seq() override {
return seperator_is_key_plus_seq_;
}
private:
void MakeNewSubIndexBuilder();
@ -333,6 +376,7 @@ class PartitionedIndexBuilder : public IndexBuilder {
// true if Finish is called once but not complete yet.
bool finishing_indexes = false;
const BlockBasedTableOptions& table_opt_;
bool seperator_is_key_plus_seq_;
// true if an external entity (such as filter partition builder) request
// cutting the next partition
bool partition_cut_requested_ = true;

@ -71,6 +71,7 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) {
Add(TablePropertiesNames::kIndexPartitions, props.index_partitions);
Add(TablePropertiesNames::kTopLevelIndexSize, props.top_level_index_size);
}
Add(TablePropertiesNames::kIndexKeyIsUserKey, props.index_key_is_user_key);
Add(TablePropertiesNames::kNumEntries, props.num_entries);
Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks);
Add(TablePropertiesNames::kFilterSize, props.filter_size);
@ -192,7 +193,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file,
Block properties_block(std::move(block_contents),
kDisableGlobalSequenceNumber);
BlockIter iter;
properties_block.NewIterator(BytewiseComparator(), &iter);
properties_block.NewIterator(BytewiseComparator(), BytewiseComparator(),
&iter);
auto new_table_properties = new TableProperties();
// All pre-defined properties of type uint64_t
@ -203,6 +205,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file,
&new_table_properties->index_partitions},
{TablePropertiesNames::kTopLevelIndexSize,
&new_table_properties->top_level_index_size},
{TablePropertiesNames::kIndexKeyIsUserKey,
&new_table_properties->index_key_is_user_key},
{TablePropertiesNames::kFilterSize, &new_table_properties->filter_size},
{TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size},
{TablePropertiesNames::kRawValueSize,
@ -312,7 +316,7 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size,
Block metaindex_block(std::move(metaindex_contents),
kDisableGlobalSequenceNumber);
std::unique_ptr<InternalIterator> meta_iter(
metaindex_block.NewIterator(BytewiseComparator()));
metaindex_block.NewIterator(BytewiseComparator(), BytewiseComparator()));
// -- Read property block
bool found_properties_block = true;
@ -375,7 +379,8 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size,
kDisableGlobalSequenceNumber);
std::unique_ptr<InternalIterator> meta_iter;
meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator()));
meta_iter.reset(
metaindex_block.NewIterator(BytewiseComparator(), BytewiseComparator()));
return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle);
}
@ -416,7 +421,8 @@ Status ReadMetaBlock(RandomAccessFileReader* file,
kDisableGlobalSequenceNumber);
std::unique_ptr<InternalIterator> meta_iter;
meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator()));
meta_iter.reset(
metaindex_block.NewIterator(BytewiseComparator(), BytewiseComparator()));
BlockHandle block_handle;
status = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle);

@ -113,7 +113,7 @@ PartitionedFilterBlockReader::~PartitionedFilterBlockReader() {
char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length];
BlockIter biter;
BlockHandle handle;
idx_on_fltr_blk_->NewIterator(&comparator_, &biter, true);
idx_on_fltr_blk_->NewIterator(&comparator_, &comparator_, &biter, true);
biter.SeekToFirst();
for (; biter.Valid(); biter.Next()) {
auto input = biter.value();
@ -207,7 +207,7 @@ bool PartitionedFilterBlockReader::PrefixMayMatch(
Slice PartitionedFilterBlockReader::GetFilterPartitionHandle(
const Slice& entry) {
BlockIter iter;
idx_on_fltr_blk_->NewIterator(&comparator_, &iter, true);
idx_on_fltr_blk_->NewIterator(&comparator_, &comparator_, &iter, true);
iter.Seek(entry);
if (UNLIKELY(!iter.Valid())) {
return Slice();
@ -269,7 +269,7 @@ void PartitionedFilterBlockReader::CacheDependencies(
auto rep = table_->rep_;
BlockIter biter;
BlockHandle handle;
idx_on_fltr_blk_->NewIterator(&comparator_, &biter, true);
idx_on_fltr_blk_->NewIterator(&comparator_, &comparator_, &biter, true);
// Index partitions are assumed to be consecutive. Prefetch them all.
// Read the first block offset
biter.SeekToFirst();

@ -90,7 +90,12 @@ std::string TableProperties::ToString(
prop_delim, kv_delim);
AppendProperty(result, "data block size", data_size, prop_delim, kv_delim);
AppendProperty(result, "index block size", index_size, prop_delim, kv_delim);
char index_block_size_str[80];
snprintf(index_block_size_str, sizeof(index_block_size_str),
"index block size (user-key? %d)",
static_cast<int>(index_key_is_user_key));
AppendProperty(result, index_block_size_str, index_size, prop_delim,
kv_delim);
if (index_partitions != 0) {
AppendProperty(result, "# index partitions", index_partitions, prop_delim,
kv_delim);
@ -155,6 +160,7 @@ void TableProperties::Add(const TableProperties& tp) {
index_size += tp.index_size;
index_partitions += tp.index_partitions;
top_level_index_size += tp.top_level_index_size;
index_key_is_user_key += tp.index_key_is_user_key;
filter_size += tp.filter_size;
raw_key_size += tp.raw_key_size;
raw_value_size += tp.raw_value_size;
@ -170,6 +176,8 @@ const std::string TablePropertiesNames::kIndexPartitions =
"rocksdb.index.partitions";
const std::string TablePropertiesNames::kTopLevelIndexSize =
"rocksdb.top-level.index.size";
const std::string TablePropertiesNames::kIndexKeyIsUserKey =
"rocksdb.index.key.is.user.key";
const std::string TablePropertiesNames::kFilterSize =
"rocksdb.filter.size";
const std::string TablePropertiesNames::kRawKeySize =

@ -237,7 +237,7 @@ class BlockConstructor: public Constructor {
}
virtual InternalIterator* NewIterator(
const SliceTransform* /*prefix_extractor*/) const override {
return block_->NewIterator(comparator_);
return block_->NewIterator(comparator_, comparator_);
}
private:
@ -2115,7 +2115,7 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
GetContext::kNotFound, user_key, &value, nullptr,
nullptr, nullptr, nullptr);
ASSERT_OK(reader->Get(ReadOptions(), user_key, &get_context,
ASSERT_OK(reader->Get(ReadOptions(), internal_key.Encode(), &get_context,
moptions4.prefix_extractor.get()));
ASSERT_STREQ(value.data(), "hello");
BlockCachePropertiesSnapshot props(options.statistics.get());
@ -2427,7 +2427,8 @@ TEST_F(BlockBasedTableTest, BlockCacheLeak) {
ASSERT_OK(c.Reopen(ioptions1, moptions1));
auto table_reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
for (const std::string& key : keys) {
ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key));
InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode()));
}
c.ResetTableReader();
@ -2439,7 +2440,8 @@ TEST_F(BlockBasedTableTest, BlockCacheLeak) {
ASSERT_OK(c.Reopen(ioptions2, moptions2));
table_reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
for (const std::string& key : keys) {
ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), key));
InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode()));
}
c.ResetTableReader();
}

@ -87,13 +87,6 @@ class PlainInternalKeyComparator : public InternalKeyComparator {
// Orders a and b by handing both slices, unmodified, to the user comparator
// and returning its result.
virtual int Compare(const Slice& a, const Slice& b) const override {
return user_comparator()->Compare(a, b);
}
virtual void FindShortestSeparator(std::string* start,
const Slice& limit) const override {
user_comparator()->FindShortestSeparator(start, limit);
}
virtual void FindShortSuccessor(std::string* key) const override {
user_comparator()->FindShortSuccessor(key);
}
};
#endif

Loading…
Cancel
Save