Exclude seq from index keys

Summary:
Index blocks have the same format as data blocks. Their keys, like the keys in data blocks, are therefore internal keys: in addition to the user key, each key has 8 bytes that encode the sequence number and value type. These extra 8 bytes, however, are not necessary in index blocks, since an index key only acts as a separator between two data blocks. The only exception is when the last key of a block and the first key of the next block share the same user key, in which case the sequence number is required to act as a separator.
The patch excludes the sequence number from index keys only if the above special case does not happen for any of the index keys. It then records this decision in the property block. The reader looks at the property block to see if it should expect sequence numbers in the keys of the index blocks.
Closes https://github.com/facebook/rocksdb/pull/3894

Differential Revision: D8118775

Pulled By: maysamyabandeh

fbshipit-source-id: 915479f028b5799ca91671d67455ecdefbd873bd
main
Maysam Yabandeh 6 years ago committed by Facebook Github Bot
parent 8c3bf0801b
commit 402b7aa07f
  1. 3
      HISTORY.md
  2. 28
      db/db_properties_test.cc
  3. 7
      include/rocksdb/table.h
  4. 4
      include/rocksdb/table_properties.h
  5. 43
      table/block.cc
  6. 48
      table/block.h
  7. 2
      table/block_based_table_builder.cc
  8. 143
      table/block_based_table_reader.cc
  9. 15
      table/block_based_table_reader.h
  10. 19
      table/block_test.cc
  11. 25
      table/index_builder.cc
  12. 54
      table/index_builder.h
  13. 14
      table/meta_blocks.cc
  14. 6
      table/partitioned_filter_block.cc
  15. 10
      table/table_properties.cc
  16. 10
      table/table_test.cc
  17. 7
      util/testutil.h

@ -4,6 +4,9 @@
* For users of `Statistics` objects created via `CreateDBStatistics()`, the format of the string returned by its `ToString()` method has changed. * For users of `Statistics` objects created via `CreateDBStatistics()`, the format of the string returned by its `ToString()` method has changed.
* With LRUCache, when high_pri_pool_ratio > 0, midpoint insertion strategy will be enabled to put low-pri items to the tail of low-pri list (the midpoint) when they first inserted into the cache. This is to make cache entries never get hit age out faster, improving cache efficiency when large background scan presents. * With LRUCache, when high_pri_pool_ratio > 0, midpoint insertion strategy will be enabled to put low-pri items to the tail of low-pri list (the midpoint) when they first inserted into the cache. This is to make cache entries never get hit age out faster, improving cache efficiency when large background scan presents.
### New Features
* Changes the format of index blocks by storing the keys in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatible but not forward compatible. It is disabled by default unless format_version 3 or above is used.
## 5.14.0 (5/16/2018) ## 5.14.0 (5/16/2018)
### Public API Change ### Public API Change
* Add a BlockBasedTableOption to align uncompressed data blocks on the smaller of block size or page size boundary, to reduce flash reads by avoiding reads spanning 4K pages. * Add a BlockBasedTableOption to align uncompressed data blocks on the smaller of block size or page size boundary, to reduce flash reads by avoiding reads spanning 4K pages.

@ -177,17 +177,16 @@ void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) {
std::replace(tp_string.begin(), tp_string.end(), ';', ' '); std::replace(tp_string.begin(), tp_string.end(), ';', ' ');
std::replace(tp_string.begin(), tp_string.end(), '=', ' '); std::replace(tp_string.begin(), tp_string.end(), '=', ' ');
ResetTableProperties(tp); ResetTableProperties(tp);
sscanf(tp_string.c_str(), sscanf(tp_string.c_str(),
"# data blocks %" SCNu64 " # entries %" SCNu64 " raw key size %" SCNu64 "# data blocks %" SCNu64 " # entries %" SCNu64 " raw key size %" SCNu64
" raw average key size %lf " " raw average key size %lf "
" raw value size %" SCNu64 " raw value size %" SCNu64
" raw average value size %lf " " raw average value size %lf "
" data block size %" SCNu64 " index block size %" SCNu64 " data block size %" SCNu64 " index block size (user-key? %" SCNu64
" filter block size %" SCNu64, ") %" SCNu64 " filter block size %" SCNu64,
&tp->num_data_blocks, &tp->num_entries, &tp->raw_key_size, &tp->num_data_blocks, &tp->num_entries, &tp->raw_key_size,
&dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size, &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size,
&tp->index_size, &tp->filter_size); &tp->index_key_is_user_key, &tp->index_size, &tp->filter_size);
} }
void VerifySimilar(uint64_t a, uint64_t b, double bias) { void VerifySimilar(uint64_t a, uint64_t b, double bias) {
@ -224,7 +223,8 @@ void GetExpectedTableProperties(TableProperties* expected_tp,
const int kKeySize, const int kValueSize, const int kKeySize, const int kValueSize,
const int kKeysPerTable, const int kTableCount, const int kKeysPerTable, const int kTableCount,
const int kBloomBitsPerKey, const int kBloomBitsPerKey,
const size_t kBlockSize) { const size_t kBlockSize,
const bool index_key_is_user_key) {
const int kKeyCount = kTableCount * kKeysPerTable; const int kKeyCount = kTableCount * kKeysPerTable;
const int kAvgSuccessorSize = kKeySize / 5; const int kAvgSuccessorSize = kKeySize / 5;
const int kEncodingSavePerKey = kKeySize / 4; const int kEncodingSavePerKey = kKeySize / 4;
@ -238,7 +238,8 @@ void GetExpectedTableProperties(TableProperties* expected_tp,
expected_tp->data_size = expected_tp->data_size =
kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize)); kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize));
expected_tp->index_size = expected_tp->index_size =
expected_tp->num_data_blocks * (kAvgSuccessorSize + 8); expected_tp->num_data_blocks *
(kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8));
expected_tp->filter_size = expected_tp->filter_size =
kTableCount * (kKeysPerTable * kBloomBitsPerKey / 8); kTableCount * (kKeysPerTable * kBloomBitsPerKey / 8);
} }
@ -315,14 +316,14 @@ TEST_F(DBPropertiesTest, AggregatedTableProperties) {
} }
std::string property; std::string property;
db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property); db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property);
TableProperties output_tp;
ParseTablePropertiesString(property, &output_tp);
bool index_key_is_user_key = output_tp.index_key_is_user_key > 0;
TableProperties expected_tp; TableProperties expected_tp;
GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize,
kKeysPerTable, kTableCount, kBloomBitsPerKey, kKeysPerTable, kTableCount, kBloomBitsPerKey,
table_options.block_size); table_options.block_size, index_key_is_user_key);
TableProperties output_tp;
ParseTablePropertiesString(property, &output_tp);
VerifyTableProperties(expected_tp, output_tp); VerifyTableProperties(expected_tp, output_tp);
} }
@ -489,6 +490,7 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
} }
db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string); db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string);
ParseTablePropertiesString(tp_string, &tp); ParseTablePropertiesString(tp_string, &tp);
bool index_key_is_user_key = tp.index_key_is_user_key > 0;
ASSERT_EQ(sum_tp.data_size, tp.data_size); ASSERT_EQ(sum_tp.data_size, tp.data_size);
ASSERT_EQ(sum_tp.index_size, tp.index_size); ASSERT_EQ(sum_tp.index_size, tp.index_size);
ASSERT_EQ(sum_tp.filter_size, tp.filter_size); ASSERT_EQ(sum_tp.filter_size, tp.filter_size);
@ -497,9 +499,9 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
ASSERT_EQ(sum_tp.num_data_blocks, tp.num_data_blocks); ASSERT_EQ(sum_tp.num_data_blocks, tp.num_data_blocks);
ASSERT_EQ(sum_tp.num_entries, tp.num_entries); ASSERT_EQ(sum_tp.num_entries, tp.num_entries);
if (table > 3) { if (table > 3) {
GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, GetExpectedTableProperties(
kKeysPerTable, table, kBloomBitsPerKey, &expected_tp, kKeySize, kValueSize, kKeysPerTable, table,
table_options.block_size); kBloomBitsPerKey, table_options.block_size, index_key_is_user_key);
// Gives larger bias here as index block size, filter block size, // Gives larger bias here as index block size, filter block size,
// and data block size become much harder to estimate in this test. // and data block size become much harder to estimate in this test.
VerifyTableProperties(tp, expected_tp, 0.5, 0.4, 0.4, 0.25); VerifyTableProperties(tp, expected_tp, 0.5, 0.4, 0.4, 0.25);

@ -214,8 +214,11 @@ struct BlockBasedTableOptions {
// encode compressed blocks with LZ4, BZip2 and Zlib compression. If you // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you
// don't plan to run RocksDB before version 3.10, you should probably use // don't plan to run RocksDB before version 3.10, you should probably use
// this. // this.
// This option only affects newly written tables. When reading existing tables, // 3 -- Can be read by RocksDB's versions since 5.15. Changes the way we
// the information about version is read from the footer. // encode the keys in index blocks. If you don't plan to run RocksDB before
// version 5.15, you should probably use this.
// This option only affects newly written tables. When reading existing
// tables, the information about version is read from the footer.
uint32_t format_version = 2; uint32_t format_version = 2;
// Store index blocks on disk in compressed format. Changing this option to // Store index blocks on disk in compressed format. Changing this option to

@ -33,6 +33,7 @@ struct TablePropertiesNames {
static const std::string kIndexSize; static const std::string kIndexSize;
static const std::string kIndexPartitions; static const std::string kIndexPartitions;
static const std::string kTopLevelIndexSize; static const std::string kTopLevelIndexSize;
static const std::string kIndexKeyIsUserKey;
static const std::string kFilterSize; static const std::string kFilterSize;
static const std::string kRawKeySize; static const std::string kRawKeySize;
static const std::string kRawValueSize; static const std::string kRawValueSize;
@ -134,6 +135,9 @@ struct TableProperties {
uint64_t index_partitions = 0; uint64_t index_partitions = 0;
// Size of the top-level index if kTwoLevelIndexSearch is used // Size of the top-level index if kTwoLevelIndexSearch is used
uint64_t top_level_index_size = 0; uint64_t top_level_index_size = 0;
// Whether the index key is user key. Otherwise it includes 8 byte of sequence
// number added by internal key format.
uint64_t index_key_is_user_key = 0;
// the size of filter block. // the size of filter block.
uint64_t filter_size = 0; uint64_t filter_size = 0;
// total raw key size // total raw key size

@ -87,7 +87,11 @@ void BlockIter::Prev() {
const Slice current_key(key_ptr, current_prev_entry.key_size); const Slice current_key(key_ptr, current_prev_entry.key_size);
current_ = current_prev_entry.offset; current_ = current_prev_entry.offset;
if (key_includes_seq_) {
key_.SetInternalKey(current_key, false /* copy */); key_.SetInternalKey(current_key, false /* copy */);
} else {
key_.SetUserKey(current_key, false /* copy */);
}
value_ = current_prev_entry.value; value_ = current_prev_entry.value;
return; return;
@ -136,6 +140,10 @@ void BlockIter::Prev() {
} }
void BlockIter::Seek(const Slice& target) { void BlockIter::Seek(const Slice& target) {
Slice seek_key = target;
if (!key_includes_seq_) {
seek_key = ExtractUserKey(target);
}
PERF_TIMER_GUARD(block_seek_nanos); PERF_TIMER_GUARD(block_seek_nanos);
if (data_ == nullptr) { // Not init yet if (data_ == nullptr) { // Not init yet
return; return;
@ -145,7 +153,7 @@ void BlockIter::Seek(const Slice& target) {
if (prefix_index_) { if (prefix_index_) {
ok = PrefixSeek(target, &index); ok = PrefixSeek(target, &index);
} else { } else {
ok = BinarySeek(target, 0, num_restarts_ - 1, &index); ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index);
} }
if (!ok) { if (!ok) {
@ -155,7 +163,7 @@ void BlockIter::Seek(const Slice& target) {
// Linear search (within restart block) for first key >= target // Linear search (within restart block) for first key >= target
while (true) { while (true) {
if (!ParseNextKey() || Compare(key_.GetInternalKey(), target) >= 0) { if (!ParseNextKey() || Compare(key_, seek_key) >= 0) {
return; return;
} }
} }
@ -163,24 +171,28 @@ void BlockIter::Seek(const Slice& target) {
void BlockIter::SeekForPrev(const Slice& target) { void BlockIter::SeekForPrev(const Slice& target) {
PERF_TIMER_GUARD(block_seek_nanos); PERF_TIMER_GUARD(block_seek_nanos);
Slice seek_key = target;
if (!key_includes_seq_) {
seek_key = ExtractUserKey(target);
}
if (data_ == nullptr) { // Not init yet if (data_ == nullptr) { // Not init yet
return; return;
} }
uint32_t index = 0; uint32_t index = 0;
bool ok = BinarySeek(target, 0, num_restarts_ - 1, &index); bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index);
if (!ok) { if (!ok) {
return; return;
} }
SeekToRestartPoint(index); SeekToRestartPoint(index);
// Linear search (within restart block) for first key >= target // Linear search (within restart block) for first key >= seek_key
while (ParseNextKey() && Compare(key_.GetInternalKey(), target) < 0) { while (ParseNextKey() && Compare(key_, seek_key) < 0) {
} }
if (!Valid()) { if (!Valid()) {
SeekToLast(); SeekToLast();
} else { } else {
while (Valid() && Compare(key_.GetInternalKey(), target) > 0) { while (Valid() && Compare(key_, seek_key) > 0) {
Prev(); Prev();
} }
} }
@ -233,7 +245,11 @@ bool BlockIter::ParseNextKey() {
if (shared == 0) { if (shared == 0) {
// If this key dont share any bytes with prev key then we dont need // If this key dont share any bytes with prev key then we dont need
// to decode it and can use it's address in the block directly. // to decode it and can use it's address in the block directly.
if (key_includes_seq_) {
key_.SetInternalKey(Slice(p, non_shared), false /* copy */); key_.SetInternalKey(Slice(p, non_shared), false /* copy */);
} else {
key_.SetUserKey(Slice(p, non_shared), false /* copy */);
}
key_pinned_ = true; key_pinned_ = true;
} else { } else {
// This key share `shared` bytes with prev key, we need to decode it // This key share `shared` bytes with prev key, we need to decode it
@ -380,6 +396,10 @@ bool BlockIter::BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
bool BlockIter::PrefixSeek(const Slice& target, uint32_t* index) { bool BlockIter::PrefixSeek(const Slice& target, uint32_t* index) {
assert(prefix_index_); assert(prefix_index_);
Slice seek_key = target;
if (!key_includes_seq_) {
seek_key = ExtractUserKey(target);
}
uint32_t* block_ids = nullptr; uint32_t* block_ids = nullptr;
uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids); uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);
@ -387,7 +407,7 @@ bool BlockIter::PrefixSeek(const Slice& target, uint32_t* index) {
current_ = restarts_; current_ = restarts_;
return false; return false;
} else { } else {
return BinaryBlockIndexSeek(target, block_ids, 0, num_blocks - 1, index); return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index);
} }
} }
@ -422,8 +442,9 @@ Block::Block(BlockContents&& contents, SequenceNumber _global_seqno,
} }
} }
BlockIter* Block::NewIterator(const Comparator* cmp, BlockIter* iter, BlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp,
bool total_order_seek, Statistics* stats) { BlockIter* iter, bool total_order_seek,
Statistics* stats, bool key_includes_seq) {
BlockIter* ret_iter; BlockIter* ret_iter;
if (iter != nullptr) { if (iter != nullptr) {
ret_iter = iter; ret_iter = iter;
@ -441,9 +462,9 @@ BlockIter* Block::NewIterator(const Comparator* cmp, BlockIter* iter,
} else { } else {
BlockPrefixIndex* prefix_index_ptr = BlockPrefixIndex* prefix_index_ptr =
total_order_seek ? nullptr : prefix_index_.get(); total_order_seek ? nullptr : prefix_index_.get();
ret_iter->Initialize(cmp, data_, restart_offset_, num_restarts_, ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_,
prefix_index_ptr, global_seqno_, prefix_index_ptr, global_seqno_,
read_amp_bitmap_.get()); read_amp_bitmap_.get(), key_includes_seq);
if (read_amp_bitmap_) { if (read_amp_bitmap_) {
if (read_amp_bitmap_->GetStatistics() != stats) { if (read_amp_bitmap_->GetStatistics() != stats) {

@ -162,6 +162,9 @@ class Block {
// the iterator will simply be set as "invalid", rather than returning // the iterator will simply be set as "invalid", rather than returning
// the key that is just pass the target key. // the key that is just pass the target key.
// //
// If comparator is InternalKeyComparator, user_comparator is its user
// comparator; they are equal otherwise.
//
// If iter is null, return new Iterator // If iter is null, return new Iterator
// If iter is not null, update this one and return it as Iterator* // If iter is not null, update this one and return it as Iterator*
// //
@ -169,9 +172,11 @@ class Block {
// This option only applies for index block. For data block, hash_index_ // This option only applies for index block. For data block, hash_index_
// and prefix_index_ are null, so this option does not matter. // and prefix_index_ are null, so this option does not matter.
BlockIter* NewIterator(const Comparator* comparator, BlockIter* NewIterator(const Comparator* comparator,
const Comparator* user_comparator,
BlockIter* iter = nullptr, BlockIter* iter = nullptr,
bool total_order_seek = true, bool total_order_seek = true,
Statistics* stats = nullptr); Statistics* stats = nullptr,
bool key_includes_seq = true);
void SetBlockPrefixIndex(BlockPrefixIndex* prefix_index); void SetBlockPrefixIndex(BlockPrefixIndex* prefix_index);
// Report an approximation of how much memory has been used. // Report an approximation of how much memory has been used.
@ -203,6 +208,7 @@ class BlockIter final : public InternalIterator {
// and status() is OK. // and status() is OK.
BlockIter() BlockIter()
: comparator_(nullptr), : comparator_(nullptr),
user_comparator_(nullptr),
data_(nullptr), data_(nullptr),
restarts_(0), restarts_(0),
num_restarts_(0), num_restarts_(0),
@ -211,26 +217,30 @@ class BlockIter final : public InternalIterator {
status_(Status::OK()), status_(Status::OK()),
prefix_index_(nullptr), prefix_index_(nullptr),
key_pinned_(false), key_pinned_(false),
key_includes_seq_(true),
global_seqno_(kDisableGlobalSequenceNumber), global_seqno_(kDisableGlobalSequenceNumber),
read_amp_bitmap_(nullptr), read_amp_bitmap_(nullptr),
last_bitmap_offset_(0) {} last_bitmap_offset_(0) {}
BlockIter(const Comparator* comparator, const char* data, uint32_t restarts, BlockIter(const Comparator* comparator, const Comparator* user_comparator,
uint32_t num_restarts, BlockPrefixIndex* prefix_index, const char* data, uint32_t restarts, uint32_t num_restarts,
SequenceNumber global_seqno, BlockReadAmpBitmap* read_amp_bitmap) BlockPrefixIndex* prefix_index, SequenceNumber global_seqno,
BlockReadAmpBitmap* read_amp_bitmap, bool key_includes_seq)
: BlockIter() { : BlockIter() {
Initialize(comparator, data, restarts, num_restarts, prefix_index, Initialize(comparator, user_comparator, data, restarts, num_restarts,
global_seqno, read_amp_bitmap); prefix_index, global_seqno, read_amp_bitmap, key_includes_seq);
} }
void Initialize(const Comparator* comparator, const char* data, void Initialize(const Comparator* comparator,
const Comparator* user_comparator, const char* data,
uint32_t restarts, uint32_t num_restarts, uint32_t restarts, uint32_t num_restarts,
BlockPrefixIndex* prefix_index, SequenceNumber global_seqno, BlockPrefixIndex* prefix_index, SequenceNumber global_seqno,
BlockReadAmpBitmap* read_amp_bitmap) { BlockReadAmpBitmap* read_amp_bitmap, bool key_includes_seq) {
assert(data_ == nullptr); // Ensure it is called only once assert(data_ == nullptr); // Ensure it is called only once
assert(num_restarts > 0); // Ensure the param is valid assert(num_restarts > 0); // Ensure the param is valid
comparator_ = comparator; comparator_ = comparator;
user_comparator_ = user_comparator;
data_ = data; data_ = data;
restarts_ = restarts; restarts_ = restarts;
num_restarts_ = num_restarts; num_restarts_ = num_restarts;
@ -240,6 +250,7 @@ class BlockIter final : public InternalIterator {
global_seqno_ = global_seqno; global_seqno_ = global_seqno;
read_amp_bitmap_ = read_amp_bitmap; read_amp_bitmap_ = read_amp_bitmap;
last_bitmap_offset_ = current_ + 1; last_bitmap_offset_ = current_ + 1;
key_includes_seq_ = key_includes_seq;
} }
// Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do
@ -263,7 +274,7 @@ class BlockIter final : public InternalIterator {
virtual Status status() const override { return status_; } virtual Status status() const override { return status_; }
virtual Slice key() const override { virtual Slice key() const override {
assert(Valid()); assert(Valid());
return key_.GetInternalKey(); return key_includes_seq_ ? key_.GetInternalKey() : key_.GetUserKey();
} }
virtual Slice value() const override { virtual Slice value() const override {
assert(Valid()); assert(Valid());
@ -312,7 +323,11 @@ class BlockIter final : public InternalIterator {
} }
private: private:
// Note: The type could be changed to InternalKeyComparator but we see a weird
// performance drop by that.
const Comparator* comparator_; const Comparator* comparator_;
// Same as comparator_ if comparator_ is not InernalKeyComparator
const Comparator* user_comparator_;
const char* data_; // underlying block contents const char* data_; // underlying block contents
uint32_t restarts_; // Offset of restart array (list of fixed32) uint32_t restarts_; // Offset of restart array (list of fixed32)
uint32_t num_restarts_; // Number of uint32_t entries in restart array uint32_t num_restarts_; // Number of uint32_t entries in restart array
@ -325,8 +340,11 @@ class BlockIter final : public InternalIterator {
Status status_; Status status_;
BlockPrefixIndex* prefix_index_; BlockPrefixIndex* prefix_index_;
bool key_pinned_; bool key_pinned_;
// Key is in InternalKey format
bool key_includes_seq_;
SequenceNumber global_seqno_; SequenceNumber global_seqno_;
public:
// read-amp bitmap // read-amp bitmap
BlockReadAmpBitmap* read_amp_bitmap_; BlockReadAmpBitmap* read_amp_bitmap_;
// last `current_` value we report to read-amp bitmp // last `current_` value we report to read-amp bitmp
@ -357,7 +375,19 @@ class BlockIter final : public InternalIterator {
int32_t prev_entries_idx_ = -1; int32_t prev_entries_idx_ = -1;
inline int Compare(const Slice& a, const Slice& b) const { inline int Compare(const Slice& a, const Slice& b) const {
if (key_includes_seq_) {
return comparator_->Compare(a, b); return comparator_->Compare(a, b);
} else {
return user_comparator_->Compare(a, b);
}
}
inline int Compare(const IterKey& ikey, const Slice& b) const {
if (key_includes_seq_) {
return comparator_->Compare(ikey.GetInternalKey(), b);
} else {
return user_comparator_->Compare(ikey.GetUserKey(), b);
}
} }
// Return the offset in data_ just past the end of the current entry. // Return the offset in data_ just past the end of the current entry.

@ -763,6 +763,8 @@ Status BlockBasedTableBuilder::Finish() {
r->props.top_level_index_size = r->props.top_level_index_size =
r->p_index_builder_->EstimateTopLevelIndexSize(r->offset); r->p_index_builder_->EstimateTopLevelIndexSize(r->offset);
} }
r->props.index_key_is_user_key =
!r->index_builder->seperator_is_key_plus_seq();
r->props.creation_time = r->creation_time; r->props.creation_time = r->creation_time;
r->props.oldest_key_time = r->oldest_key_time; r->props.oldest_key_time = r->oldest_key_time;

@ -212,7 +212,7 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
const InternalKeyComparator* icomparator, const InternalKeyComparator* icomparator,
IndexReader** index_reader, IndexReader** index_reader,
const PersistentCacheOptions& cache_options, const PersistentCacheOptions& cache_options,
const int level) { const int level, const bool index_key_includes_seq) {
std::unique_ptr<Block> index_block; std::unique_ptr<Block> index_block;
auto s = ReadBlockFromFile( auto s = ReadBlockFromFile(
file, prefetch_buffer, footer, ReadOptions(), index_handle, file, prefetch_buffer, footer, ReadOptions(), index_handle,
@ -221,9 +221,9 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */); kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */);
if (s.ok()) { if (s.ok()) {
*index_reader = *index_reader = new PartitionIndexReader(
new PartitionIndexReader(table, icomparator, std::move(index_block), table, icomparator, std::move(index_block), ioptions.statistics,
ioptions.statistics, level); level, index_key_includes_seq);
} }
return s; return s;
@ -237,15 +237,19 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
if (!partition_map_.empty()) { if (!partition_map_.empty()) {
return NewTwoLevelIterator( return NewTwoLevelIterator(
new BlockBasedTable::PartitionedIndexIteratorState( new BlockBasedTable::PartitionedIndexIteratorState(
table_, partition_map_.size() ? &partition_map_ : nullptr), table_, &partition_map_, index_key_includes_seq_),
index_block_->NewIterator(icomparator_, nullptr, true)); index_block_->NewIterator(
icomparator_, icomparator_->user_comparator(), nullptr, true));
} else { } else {
auto ro = ReadOptions(); auto ro = ReadOptions();
ro.fill_cache = fill_cache; ro.fill_cache = fill_cache;
bool kIsIndex = true;
return new BlockBasedTableIterator( return new BlockBasedTableIterator(
table_, ro, *icomparator_, table_, ro, *icomparator_,
index_block_->NewIterator(icomparator_, nullptr, true), false, index_block_->NewIterator(
/* prefix_extractor */ nullptr); icomparator_, icomparator_->user_comparator(), nullptr, true),
false,
/* prefix_extractor */ nullptr, kIsIndex, index_key_includes_seq_);
} }
// TODO(myabandeh): Update TwoLevelIterator to be able to make use of // TODO(myabandeh): Update TwoLevelIterator to be able to make use of
// on-stack BlockIter while the state is on heap. Currentlly it assumes // on-stack BlockIter while the state is on heap. Currentlly it assumes
@ -258,7 +262,8 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
auto rep = table_->rep_; auto rep = table_->rep_;
BlockIter biter; BlockIter biter;
BlockHandle handle; BlockHandle handle;
index_block_->NewIterator(icomparator_, &biter, true); index_block_->NewIterator(icomparator_, icomparator_->user_comparator(),
&biter, true);
// Index partitions are assumed to be consecuitive. Prefetch them all. // Index partitions are assumed to be consecuitive. Prefetch them all.
// Read the first block offset // Read the first block offset
biter.SeekToFirst(); biter.SeekToFirst();
@ -347,16 +352,18 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
PartitionIndexReader(BlockBasedTable* table, PartitionIndexReader(BlockBasedTable* table,
const InternalKeyComparator* icomparator, const InternalKeyComparator* icomparator,
std::unique_ptr<Block>&& index_block, Statistics* stats, std::unique_ptr<Block>&& index_block, Statistics* stats,
const int /*level*/) const int /*level*/, const bool index_key_includes_seq)
: IndexReader(icomparator, stats), : IndexReader(icomparator, stats),
table_(table), table_(table),
index_block_(std::move(index_block)) { index_block_(std::move(index_block)),
index_key_includes_seq_(index_key_includes_seq) {
assert(index_block_ != nullptr); assert(index_block_ != nullptr);
} }
BlockBasedTable* table_; BlockBasedTable* table_;
std::unique_ptr<Block> index_block_; std::unique_ptr<Block> index_block_;
std::unordered_map<uint64_t, BlockBasedTable::CachableEntry<Block>> std::unordered_map<uint64_t, BlockBasedTable::CachableEntry<Block>>
partition_map_; partition_map_;
const bool index_key_includes_seq_;
}; };
// Index that allows binary search lookup for the first key of each block. // Index that allows binary search lookup for the first key of each block.
@ -374,7 +381,8 @@ class BinarySearchIndexReader : public IndexReader {
const ImmutableCFOptions& ioptions, const ImmutableCFOptions& ioptions,
const InternalKeyComparator* icomparator, const InternalKeyComparator* icomparator,
IndexReader** index_reader, IndexReader** index_reader,
const PersistentCacheOptions& cache_options) { const PersistentCacheOptions& cache_options,
const bool index_key_includes_seq) {
std::unique_ptr<Block> index_block; std::unique_ptr<Block> index_block;
auto s = ReadBlockFromFile( auto s = ReadBlockFromFile(
file, prefetch_buffer, footer, ReadOptions(), index_handle, file, prefetch_buffer, footer, ReadOptions(), index_handle,
@ -384,7 +392,8 @@ class BinarySearchIndexReader : public IndexReader {
if (s.ok()) { if (s.ok()) {
*index_reader = new BinarySearchIndexReader( *index_reader = new BinarySearchIndexReader(
icomparator, std::move(index_block), ioptions.statistics); icomparator, std::move(index_block), ioptions.statistics,
index_key_includes_seq);
} }
return s; return s;
@ -393,7 +402,9 @@ class BinarySearchIndexReader : public IndexReader {
virtual InternalIterator* NewIterator(BlockIter* iter = nullptr, virtual InternalIterator* NewIterator(BlockIter* iter = nullptr,
bool /*dont_care*/ = true, bool /*dont_care*/ = true,
bool /*dont_care*/ = true) override { bool /*dont_care*/ = true) override {
return index_block_->NewIterator(icomparator_, iter, true); return index_block_->NewIterator(icomparator_,
icomparator_->user_comparator(), iter,
true, nullptr, index_key_includes_seq_);
} }
virtual size_t size() const override { return index_block_->size(); } virtual size_t size() const override { return index_block_->size(); }
@ -409,11 +420,14 @@ class BinarySearchIndexReader : public IndexReader {
private: private:
BinarySearchIndexReader(const InternalKeyComparator* icomparator, BinarySearchIndexReader(const InternalKeyComparator* icomparator,
std::unique_ptr<Block>&& index_block, std::unique_ptr<Block>&& index_block,
Statistics* stats) Statistics* stats, const bool index_key_includes_seq)
: IndexReader(icomparator, stats), index_block_(std::move(index_block)) { : IndexReader(icomparator, stats),
index_block_(std::move(index_block)),
index_key_includes_seq_(index_key_includes_seq) {
assert(index_block_ != nullptr); assert(index_block_ != nullptr);
} }
std::unique_ptr<Block> index_block_; std::unique_ptr<Block> index_block_;
const bool index_key_includes_seq_;
}; };
// Index that leverages an internal hash table to quicken the lookup for a given // Index that leverages an internal hash table to quicken the lookup for a given
@ -429,7 +443,8 @@ class HashIndexReader : public IndexReader {
InternalIterator* meta_index_iter, InternalIterator* meta_index_iter,
IndexReader** index_reader, IndexReader** index_reader,
bool /*hash_index_allow_collision*/, bool /*hash_index_allow_collision*/,
const PersistentCacheOptions& cache_options) { const PersistentCacheOptions& cache_options,
const bool index_key_includes_seq) {
std::unique_ptr<Block> index_block; std::unique_ptr<Block> index_block;
auto s = ReadBlockFromFile( auto s = ReadBlockFromFile(
file, prefetch_buffer, footer, ReadOptions(), index_handle, file, prefetch_buffer, footer, ReadOptions(), index_handle,
@ -447,7 +462,7 @@ class HashIndexReader : public IndexReader {
auto new_index_reader = auto new_index_reader =
new HashIndexReader(icomparator, std::move(index_block), new HashIndexReader(icomparator, std::move(index_block),
ioptions.statistics); ioptions.statistics, index_key_includes_seq);
*index_reader = new_index_reader; *index_reader = new_index_reader;
// Get prefixes block // Get prefixes block
@ -504,7 +519,9 @@ class HashIndexReader : public IndexReader {
virtual InternalIterator* NewIterator(BlockIter* iter = nullptr, virtual InternalIterator* NewIterator(BlockIter* iter = nullptr,
bool total_order_seek = true, bool total_order_seek = true,
bool /*dont_care*/ = true) override { bool /*dont_care*/ = true) override {
return index_block_->NewIterator(icomparator_, iter, total_order_seek); return index_block_->NewIterator(
icomparator_, icomparator_->user_comparator(), iter, total_order_seek,
nullptr, index_key_includes_seq_);
} }
virtual size_t size() const override { return index_block_->size(); } virtual size_t size() const override { return index_block_->size(); }
@ -520,8 +537,11 @@ class HashIndexReader : public IndexReader {
private: private:
HashIndexReader(const InternalKeyComparator* icomparator, HashIndexReader(const InternalKeyComparator* icomparator,
std::unique_ptr<Block>&& index_block, Statistics* stats) std::unique_ptr<Block>&& index_block, Statistics* stats,
: IndexReader(icomparator, stats), index_block_(std::move(index_block)) { const bool index_key_includes_seq)
: IndexReader(icomparator, stats),
index_block_(std::move(index_block)),
index_key_includes_seq_(index_key_includes_seq) {
assert(index_block_ != nullptr); assert(index_block_ != nullptr);
} }
@ -530,6 +550,7 @@ class HashIndexReader : public IndexReader {
std::unique_ptr<Block> index_block_; std::unique_ptr<Block> index_block_;
BlockContents prefixes_contents_; BlockContents prefixes_contents_;
const bool index_key_includes_seq_;
}; };
// Helper function to setup the cache key's prefix for the Table. // Helper function to setup the cache key's prefix for the Table.
@ -1026,7 +1047,8 @@ Status BlockBasedTable::ReadMetaBlock(Rep* rep,
*meta_block = std::move(meta); *meta_block = std::move(meta);
// meta block uses bytewise comparator. // meta block uses bytewise comparator.
iter->reset(meta_block->get()->NewIterator(BytewiseComparator())); iter->reset(meta_block->get()->NewIterator(BytewiseComparator(),
BytewiseComparator()));
return Status::OK(); return Status::OK();
} }
@ -1502,14 +1524,15 @@ InternalIterator* BlockBasedTable::NewIndexIterator(
BlockIter* BlockBasedTable::NewDataBlockIterator( BlockIter* BlockBasedTable::NewDataBlockIterator(
Rep* rep, const ReadOptions& ro, const Slice& index_value, Rep* rep, const ReadOptions& ro, const Slice& index_value,
BlockIter* input_iter, bool is_index, GetContext* get_context) { BlockIter* input_iter, bool is_index, bool key_includes_seq,
GetContext* get_context) {
BlockHandle handle; BlockHandle handle;
Slice input = index_value; Slice input = index_value;
// We intentionally allow extra stuff in index_value so that we // We intentionally allow extra stuff in index_value so that we
// can add more features in the future. // can add more features in the future.
Status s = handle.DecodeFrom(&input); Status s = handle.DecodeFrom(&input);
return NewDataBlockIterator(rep, ro, handle, input_iter, is_index, return NewDataBlockIterator(rep, ro, handle, input_iter, is_index,
get_context, s); key_includes_seq, get_context, s);
} }
// Convert an index iterator value (i.e., an encoded BlockHandle) // Convert an index iterator value (i.e., an encoded BlockHandle)
@ -1518,7 +1541,8 @@ BlockIter* BlockBasedTable::NewDataBlockIterator(
// If input_iter is not null, update this iter and return it // If input_iter is not null, update this iter and return it
BlockIter* BlockBasedTable::NewDataBlockIterator( BlockIter* BlockBasedTable::NewDataBlockIterator(
Rep* rep, const ReadOptions& ro, const BlockHandle& handle, Rep* rep, const ReadOptions& ro, const BlockHandle& handle,
BlockIter* input_iter, bool is_index, GetContext* get_context, Status s) { BlockIter* input_iter, bool is_index, bool key_includes_seq,
GetContext* get_context, Status s) {
PERF_TIMER_GUARD(new_table_block_iter_nanos); PERF_TIMER_GUARD(new_table_block_iter_nanos);
const bool no_io = (ro.read_tier == kBlockCacheTier); const bool no_io = (ro.read_tier == kBlockCacheTier);
@ -1564,8 +1588,9 @@ BlockIter* BlockBasedTable::NewDataBlockIterator(
if (s.ok()) { if (s.ok()) {
assert(block.value != nullptr); assert(block.value != nullptr);
iter = block.value->NewIterator(&rep->internal_comparator, iter, true, iter = block.value->NewIterator(
rep->ioptions.statistics); &rep->internal_comparator, rep->internal_comparator.user_comparator(),
iter, true, rep->ioptions.statistics, key_includes_seq);
if (block.cache_handle != nullptr) { if (block.cache_handle != nullptr) {
iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, iter->RegisterCleanup(&ReleaseCachedEntry, block_cache,
block.cache_handle); block.cache_handle);
@ -1677,8 +1702,11 @@ Status BlockBasedTable::MaybeLoadDataBlockToCache(
BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState(
BlockBasedTable* table, BlockBasedTable* table,
std::unordered_map<uint64_t, CachableEntry<Block>>* block_map) std::unordered_map<uint64_t, CachableEntry<Block>>* block_map,
: table_(table), block_map_(block_map) {} bool index_key_includes_seq)
: table_(table),
block_map_(block_map),
index_key_includes_seq_(index_key_includes_seq) {}
const size_t BlockBasedTableIterator::kMaxReadaheadSize = 256 * 1024; const size_t BlockBasedTableIterator::kMaxReadaheadSize = 256 * 1024;
@ -1701,8 +1729,9 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
assert(block_cache); assert(block_cache);
RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_READ, RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_READ,
block_cache->GetUsage(block->second.cache_handle)); block_cache->GetUsage(block->second.cache_handle));
return block->second.value->NewIterator(&rep->internal_comparator, nullptr, return block->second.value->NewIterator(
true, rep->ioptions.statistics); &rep->internal_comparator, rep->internal_comparator.user_comparator(),
nullptr, true, rep->ioptions.statistics, index_key_includes_seq_);
} }
// Create an empty iterator // Create an empty iterator
return new BlockIter(); return new BlockIter();
@ -1770,7 +1799,9 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key,
// and we're not really sure that we're past the end // and we're not really sure that we're past the end
// of the file // of the file
may_match = iiter->status().IsIncomplete(); may_match = iiter->status().IsIncomplete();
} else if (ExtractUserKey(iiter->key()) } else if ((rep_->table_properties->index_key_is_user_key
? iiter->key()
: ExtractUserKey(iiter->key()))
.starts_with(ExtractUserKey(internal_prefix))) { .starts_with(ExtractUserKey(internal_prefix))) {
// we need to check for this subtle case because our only // we need to check for this subtle case because our only
// guarantee is that "the key is a string >= last key in that data // guarantee is that "the key is a string >= last key in that data
@ -1836,7 +1867,11 @@ void BlockBasedTableIterator::Seek(const Slice& target) {
FindKeyForward(); FindKeyForward();
assert(!data_block_iter_.Valid() || assert(!data_block_iter_.Valid() ||
icomp_.Compare(target, data_block_iter_.key()) <= 0); (key_includes_seq_ &&
icomp_.Compare(target, data_block_iter_.key()) <= 0) ||
(!key_includes_seq_ &&
icomp_.user_comparator()->Compare(ExtractUserKey(target),
data_block_iter_.key()) <= 0));
} }
void BlockBasedTableIterator::SeekForPrev(const Slice& target) { void BlockBasedTableIterator::SeekForPrev(const Slice& target) {
@ -1952,7 +1987,8 @@ void BlockBasedTableIterator::InitDataBlock() {
} }
BlockBasedTable::NewDataBlockIterator(rep, read_options_, data_block_handle, BlockBasedTable::NewDataBlockIterator(rep, read_options_, data_block_handle,
&data_block_iter_, false, &data_block_iter_, is_index_,
key_includes_seq_,
/* get_context */ nullptr, s); /* get_context */ nullptr, s);
block_iter_points_to_real_block_ = true; block_iter_points_to_real_block_ = true;
} }
@ -2024,6 +2060,7 @@ InternalIterator* BlockBasedTable::NewIterator(
Arena* arena, bool skip_filters) { Arena* arena, bool skip_filters) {
bool prefix_extractor_changed = bool prefix_extractor_changed =
PrefixExtractorChanged(rep_->table_properties, prefix_extractor); PrefixExtractorChanged(rep_->table_properties, prefix_extractor);
const bool kIsNotIndex = false;
if (arena == nullptr) { if (arena == nullptr) {
return new BlockBasedTableIterator( return new BlockBasedTableIterator(
this, read_options, rep_->internal_comparator, this, read_options, rep_->internal_comparator,
@ -2033,7 +2070,7 @@ InternalIterator* BlockBasedTable::NewIterator(
rep_->index_type == BlockBasedTableOptions::kHashSearch), rep_->index_type == BlockBasedTableOptions::kHashSearch),
!skip_filters && !read_options.total_order_seek && !skip_filters && !read_options.total_order_seek &&
prefix_extractor != nullptr && !prefix_extractor_changed, prefix_extractor != nullptr && !prefix_extractor_changed,
prefix_extractor); prefix_extractor, kIsNotIndex);
} else { } else {
auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator)); auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator));
return new (mem) BlockBasedTableIterator( return new (mem) BlockBasedTableIterator(
@ -2041,7 +2078,7 @@ InternalIterator* BlockBasedTable::NewIterator(
NewIndexIterator(read_options, prefix_extractor_changed), NewIndexIterator(read_options, prefix_extractor_changed),
!skip_filters && !read_options.total_order_seek && !skip_filters && !read_options.total_order_seek &&
prefix_extractor != nullptr && !prefix_extractor_changed, prefix_extractor != nullptr && !prefix_extractor_changed,
prefix_extractor); prefix_extractor, kIsNotIndex);
} }
} }
@ -2061,7 +2098,8 @@ InternalIterator* BlockBasedTable::NewRangeTombstoneIterator(
assert(block_cache != nullptr); assert(block_cache != nullptr);
if (block_cache->Ref(rep_->range_del_entry.cache_handle)) { if (block_cache->Ref(rep_->range_del_entry.cache_handle)) {
auto iter = rep_->range_del_entry.value->NewIterator( auto iter = rep_->range_del_entry.value->NewIterator(
&rep_->internal_comparator, nullptr /* iter */, &rep_->internal_comparator,
rep_->internal_comparator.user_comparator(), nullptr /* iter */,
true /* total_order_seek */, rep_->ioptions.statistics); true /* total_order_seek */, rep_->ioptions.statistics);
iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, iter->RegisterCleanup(&ReleaseCachedEntry, block_cache,
rep_->range_del_entry.cache_handle); rep_->range_del_entry.cache_handle);
@ -2107,6 +2145,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
GetContext* get_context, GetContext* get_context,
const SliceTransform* prefix_extractor, const SliceTransform* prefix_extractor,
bool skip_filters) { bool skip_filters) {
assert(key.size() >= 8); // key must be internal key
Status s; Status s;
const bool no_io = read_options.read_tier == kBlockCacheTier; const bool no_io = read_options.read_tier == kBlockCacheTier;
CachableEntry<FilterBlockReader> filter_entry; CachableEntry<FilterBlockReader> filter_entry;
@ -2215,6 +2254,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
Status BlockBasedTable::Prefetch(const Slice* const begin, Status BlockBasedTable::Prefetch(const Slice* const begin,
const Slice* const end) { const Slice* const end) {
auto& comparator = rep_->internal_comparator; auto& comparator = rep_->internal_comparator;
auto user_comparator = comparator.user_comparator();
// pre-condition // pre-condition
if (begin && end && comparator.Compare(*begin, *end) > 0) { if (begin && end && comparator.Compare(*begin, *end) > 0) {
return Status::InvalidArgument(*begin, *end); return Status::InvalidArgument(*begin, *end);
@ -2238,8 +2278,11 @@ Status BlockBasedTable::Prefetch(const Slice* const begin,
for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid(); for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid();
iiter->Next()) { iiter->Next()) {
Slice block_handle = iiter->value(); Slice block_handle = iiter->value();
const bool is_user_key = rep_->table_properties->index_key_is_user_key > 0;
if (end && comparator.Compare(iiter->key(), *end) >= 0) { if (end &&
((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) ||
(is_user_key &&
user_comparator->Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) {
if (prefetching_boundary_page) { if (prefetching_boundary_page) {
break; break;
} }
@ -2392,12 +2435,14 @@ Status BlockBasedTable::CreateIndexReader(
return PartitionIndexReader::Create( return PartitionIndexReader::Create(
this, file, prefetch_buffer, footer, footer.index_handle(), this, file, prefetch_buffer, footer, footer.index_handle(),
rep_->ioptions, icomparator, index_reader, rep_->ioptions, icomparator, index_reader,
rep_->persistent_cache_options, level); rep_->persistent_cache_options, level,
rep_->table_properties->index_key_is_user_key == 0);
} }
case BlockBasedTableOptions::kBinarySearch: { case BlockBasedTableOptions::kBinarySearch: {
return BinarySearchIndexReader::Create( return BinarySearchIndexReader::Create(
file, prefetch_buffer, footer, footer.index_handle(), rep_->ioptions, file, prefetch_buffer, footer, footer.index_handle(), rep_->ioptions,
icomparator, index_reader, rep_->persistent_cache_options); icomparator, index_reader, rep_->persistent_cache_options,
rep_->table_properties->index_key_is_user_key == 0);
} }
case BlockBasedTableOptions::kHashSearch: { case BlockBasedTableOptions::kHashSearch: {
std::unique_ptr<Block> meta_guard; std::unique_ptr<Block> meta_guard;
@ -2415,7 +2460,8 @@ Status BlockBasedTable::CreateIndexReader(
return BinarySearchIndexReader::Create( return BinarySearchIndexReader::Create(
file, prefetch_buffer, footer, footer.index_handle(), file, prefetch_buffer, footer, footer.index_handle(),
rep_->ioptions, icomparator, index_reader, rep_->ioptions, icomparator, index_reader,
rep_->persistent_cache_options); rep_->persistent_cache_options,
rep_->table_properties->index_key_is_user_key == 0);
} }
meta_index_iter = meta_iter_guard.get(); meta_index_iter = meta_iter_guard.get();
} }
@ -2424,7 +2470,8 @@ Status BlockBasedTable::CreateIndexReader(
rep_->internal_prefix_transform.get(), footer, file, prefetch_buffer, rep_->internal_prefix_transform.get(), footer, file, prefetch_buffer,
rep_->ioptions, icomparator, footer.index_handle(), meta_index_iter, rep_->ioptions, icomparator, footer.index_handle(), meta_index_iter,
index_reader, rep_->hash_index_allow_collision, index_reader, rep_->hash_index_allow_collision,
rep_->persistent_cache_options); rep_->persistent_cache_options,
rep_->table_properties->index_key_is_user_key == 0);
} }
default: { default: {
std::string error_message = std::string error_message =
@ -2709,16 +2756,22 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
break; break;
} }
Slice key = blockhandles_iter->key(); Slice key = blockhandles_iter->key();
Slice user_key;
InternalKey ikey; InternalKey ikey;
if (rep_->table_properties->index_key_is_user_key == 0) {
ikey.DecodeFrom(key); ikey.DecodeFrom(key);
user_key = ikey.user_key();
} else {
user_key = key;
}
out_file->Append(" HEX "); out_file->Append(" HEX ");
out_file->Append(ikey.user_key().ToString(true).c_str()); out_file->Append(user_key.ToString(true).c_str());
out_file->Append(": "); out_file->Append(": ");
out_file->Append(blockhandles_iter->value().ToString(true).c_str()); out_file->Append(blockhandles_iter->value().ToString(true).c_str());
out_file->Append("\n"); out_file->Append("\n");
std::string str_key = ikey.user_key().ToString(); std::string str_key = user_key.ToString();
std::string res_key(""); std::string res_key("");
char cspace = ' '; char cspace = ' ';
for (size_t i = 0; i < str_key.size(); i++) { for (size_t i = 0; i < str_key.size(); i++) {

@ -217,11 +217,13 @@ class BlockBasedTable : public TableReader {
const Slice& index_value, const Slice& index_value,
BlockIter* input_iter = nullptr, BlockIter* input_iter = nullptr,
bool is_index = false, bool is_index = false,
bool key_includes_seq = true,
GetContext* get_context = nullptr); GetContext* get_context = nullptr);
static BlockIter* NewDataBlockIterator(Rep* rep, const ReadOptions& ro, static BlockIter* NewDataBlockIterator(Rep* rep, const ReadOptions& ro,
const BlockHandle& block_hanlde, const BlockHandle& block_hanlde,
BlockIter* input_iter = nullptr, BlockIter* input_iter = nullptr,
bool is_index = false, bool is_index = false,
bool key_includes_seq = true,
GetContext* get_context = nullptr, GetContext* get_context = nullptr,
Status s = Status()); Status s = Status());
@ -378,13 +380,15 @@ class BlockBasedTable::PartitionedIndexIteratorState
public: public:
PartitionedIndexIteratorState( PartitionedIndexIteratorState(
BlockBasedTable* table, BlockBasedTable* table,
std::unordered_map<uint64_t, CachableEntry<Block>>* block_map = nullptr); std::unordered_map<uint64_t, CachableEntry<Block>>* block_map,
const bool index_key_includes_seq);
InternalIterator* NewSecondaryIterator(const Slice& index_value) override; InternalIterator* NewSecondaryIterator(const Slice& index_value) override;
private: private:
// Don't own table_ // Don't own table_
BlockBasedTable* table_; BlockBasedTable* table_;
std::unordered_map<uint64_t, CachableEntry<Block>>* block_map_; std::unordered_map<uint64_t, CachableEntry<Block>>* block_map_;
bool index_key_includes_seq_;
}; };
// CachableEntry represents the entries that *may* be fetched from block cache. // CachableEntry represents the entries that *may* be fetched from block cache.
@ -509,7 +513,8 @@ class BlockBasedTableIterator : public InternalIterator {
const ReadOptions& read_options, const ReadOptions& read_options,
const InternalKeyComparator& icomp, const InternalKeyComparator& icomp,
InternalIterator* index_iter, bool check_filter, InternalIterator* index_iter, bool check_filter,
const SliceTransform* prefix_extractor) const SliceTransform* prefix_extractor, bool is_index,
bool key_includes_seq = true)
: table_(table), : table_(table),
read_options_(read_options), read_options_(read_options),
icomp_(icomp), icomp_(icomp),
@ -517,6 +522,8 @@ class BlockBasedTableIterator : public InternalIterator {
pinned_iters_mgr_(nullptr), pinned_iters_mgr_(nullptr),
block_iter_points_to_real_block_(false), block_iter_points_to_real_block_(false),
check_filter_(check_filter), check_filter_(check_filter),
is_index_(is_index),
key_includes_seq_(key_includes_seq),
prefix_extractor_(prefix_extractor) {} prefix_extractor_(prefix_extractor) {}
~BlockBasedTableIterator() { delete index_iter_; } ~BlockBasedTableIterator() { delete index_iter_; }
@ -609,6 +616,10 @@ class BlockBasedTableIterator : public InternalIterator {
bool block_iter_points_to_real_block_; bool block_iter_points_to_real_block_;
bool is_out_of_bound_ = false; bool is_out_of_bound_ = false;
bool check_filter_; bool check_filter_;
// If the blocks over which we iterate are index blocks
bool is_index_;
// If the keys in the blocks over which we iterate include 8 byte sequence
bool key_includes_seq_;
// TODO use block offset instead // TODO use block offset instead
std::string prev_index_value_; std::string prev_index_value_;
const SliceTransform* prefix_extractor_; const SliceTransform* prefix_extractor_;

@ -99,7 +99,8 @@ TEST_F(BlockTest, SimpleTest) {
// read contents of block sequentially // read contents of block sequentially
int count = 0; int count = 0;
InternalIterator *iter = reader.NewIterator(options.comparator); InternalIterator *iter =
reader.NewIterator(options.comparator, options.comparator);
for (iter->SeekToFirst();iter->Valid(); count++, iter->Next()) { for (iter->SeekToFirst();iter->Valid(); count++, iter->Next()) {
// read kv from block // read kv from block
@ -113,7 +114,7 @@ TEST_F(BlockTest, SimpleTest) {
delete iter; delete iter;
// read block contents randomly // read block contents randomly
iter = reader.NewIterator(options.comparator); iter = reader.NewIterator(options.comparator, options.comparator);
for (int i = 0; i < num_records; i++) { for (int i = 0; i < num_records; i++) {
// find a random key in the lookaside array // find a random key in the lookaside array
@ -163,7 +164,7 @@ void CheckBlockContents(BlockContents contents, const int max_key,
NewFixedPrefixTransform(prefix_size)); NewFixedPrefixTransform(prefix_size));
std::unique_ptr<InternalIterator> regular_iter( std::unique_ptr<InternalIterator> regular_iter(
reader2.NewIterator(BytewiseComparator())); reader2.NewIterator(BytewiseComparator(), BytewiseComparator()));
// Seek existent keys // Seek existent keys
for (size_t i = 0; i < keys.size(); i++) { for (size_t i = 0; i < keys.size(); i++) {
@ -388,8 +389,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) {
// read contents of block sequentially // read contents of block sequentially
size_t read_bytes = 0; size_t read_bytes = 0;
BlockIter *iter = static_cast<BlockIter *>( BlockIter *iter = static_cast<BlockIter *>(reader.NewIterator(
reader.NewIterator(options.comparator, nullptr, true, stats.get())); options.comparator, options.comparator, nullptr, true, stats.get()));
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
iter->value(); iter->value();
read_bytes += iter->TEST_CurrentEntrySize(); read_bytes += iter->TEST_CurrentEntrySize();
@ -421,8 +422,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) {
kBytesPerBit, stats.get()); kBytesPerBit, stats.get());
size_t read_bytes = 0; size_t read_bytes = 0;
BlockIter *iter = static_cast<BlockIter *>( BlockIter *iter = static_cast<BlockIter *>(reader.NewIterator(
reader.NewIterator(options.comparator, nullptr, true, stats.get())); options.comparator, options.comparator, nullptr, true, stats.get()));
for (int i = 0; i < num_records; i++) { for (int i = 0; i < num_records; i++) {
Slice k(keys[i]); Slice k(keys[i]);
@ -457,8 +458,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) {
kBytesPerBit, stats.get()); kBytesPerBit, stats.get());
size_t read_bytes = 0; size_t read_bytes = 0;
BlockIter *iter = static_cast<BlockIter *>( BlockIter *iter = static_cast<BlockIter *>(reader.NewIterator(
reader.NewIterator(options.comparator, nullptr, true, stats.get())); options.comparator, options.comparator, nullptr, true, stats.get()));
std::unordered_set<int> read_keys; std::unordered_set<int> read_keys;
for (int i = 0; i < num_records; i++) { for (int i = 0; i < num_records; i++) {
int index = rnd.Uniform(num_records); int index = rnd.Uniform(num_records);

@ -32,12 +32,14 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder(
switch (index_type) { switch (index_type) {
case BlockBasedTableOptions::kBinarySearch: { case BlockBasedTableOptions::kBinarySearch: {
result = new ShortenedIndexBuilder(comparator, result = new ShortenedIndexBuilder(comparator,
table_opt.index_block_restart_interval); table_opt.index_block_restart_interval,
table_opt.format_version);
} }
break; break;
case BlockBasedTableOptions::kHashSearch: { case BlockBasedTableOptions::kHashSearch: {
result = new HashIndexBuilder(comparator, int_key_slice_transform, result = new HashIndexBuilder(comparator, int_key_slice_transform,
table_opt.index_block_restart_interval); table_opt.index_block_restart_interval,
table_opt.format_version);
} }
break; break;
case BlockBasedTableOptions::kTwoLevelIndexSearch: { case BlockBasedTableOptions::kTwoLevelIndexSearch: {
@ -62,9 +64,11 @@ PartitionedIndexBuilder::PartitionedIndexBuilder(
const InternalKeyComparator* comparator, const InternalKeyComparator* comparator,
const BlockBasedTableOptions& table_opt) const BlockBasedTableOptions& table_opt)
: IndexBuilder(comparator), : IndexBuilder(comparator),
index_block_builder_(table_opt.index_block_restart_interval), index_block_builder_(table_opt.index_block_restart_interval,
table_opt.format_version),
sub_index_builder_(nullptr), sub_index_builder_(nullptr),
table_opt_(table_opt) {} table_opt_(table_opt),
seperator_is_key_plus_seq_(false) {}
PartitionedIndexBuilder::~PartitionedIndexBuilder() { PartitionedIndexBuilder::~PartitionedIndexBuilder() {
delete sub_index_builder_; delete sub_index_builder_;
@ -73,7 +77,8 @@ PartitionedIndexBuilder::~PartitionedIndexBuilder() {
void PartitionedIndexBuilder::MakeNewSubIndexBuilder() { void PartitionedIndexBuilder::MakeNewSubIndexBuilder() {
assert(sub_index_builder_ == nullptr); assert(sub_index_builder_ == nullptr);
sub_index_builder_ = new ShortenedIndexBuilder( sub_index_builder_ = new ShortenedIndexBuilder(
comparator_, table_opt_.index_block_restart_interval); comparator_, table_opt_.index_block_restart_interval,
table_opt_.format_version);
flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
table_opt_.metadata_block_size, table_opt_.block_size_deviation, table_opt_.metadata_block_size, table_opt_.block_size_deviation,
sub_index_builder_->index_block_builder_)); sub_index_builder_->index_block_builder_));
@ -95,6 +100,10 @@ void PartitionedIndexBuilder::AddIndexEntry(
} }
sub_index_builder_->AddIndexEntry(last_key_in_current_block, sub_index_builder_->AddIndexEntry(last_key_in_current_block,
first_key_in_next_block, block_handle); first_key_in_next_block, block_handle);
if (sub_index_builder_->seperator_is_key_plus_seq_) {
// then we need to apply it to all sub-index builders
seperator_is_key_plus_seq_ = true;
}
sub_index_last_key_ = std::string(*last_key_in_current_block); sub_index_last_key_ = std::string(*last_key_in_current_block);
entries_.push_back( entries_.push_back(
{sub_index_last_key_, {sub_index_last_key_,
@ -123,6 +132,10 @@ void PartitionedIndexBuilder::AddIndexEntry(
sub_index_builder_->AddIndexEntry(last_key_in_current_block, sub_index_builder_->AddIndexEntry(last_key_in_current_block,
first_key_in_next_block, block_handle); first_key_in_next_block, block_handle);
sub_index_last_key_ = std::string(*last_key_in_current_block); sub_index_last_key_ = std::string(*last_key_in_current_block);
if (sub_index_builder_->seperator_is_key_plus_seq_) {
// then we need to apply it to all sub-index builders
seperator_is_key_plus_seq_ = true;
}
} }
} }
@ -146,6 +159,8 @@ Status PartitionedIndexBuilder::Finish(
// Finish the next partition index in line and Incomplete() to indicate we // Finish the next partition index in line and Incomplete() to indicate we
// expect more calls to Finish // expect more calls to Finish
Entry& entry = entries_.front(); Entry& entry = entries_.front();
// Apply the policy to all sub-indexes
entry.value->seperator_is_key_plus_seq_ = seperator_is_key_plus_seq_;
auto s = entry.value->Finish(index_blocks); auto s = entry.value->Finish(index_blocks);
finishing_indexes = true; finishing_indexes = true;
return s.ok() ? Status::Incomplete() : s; return s.ok() ? Status::Incomplete() : s;

@ -99,6 +99,8 @@ class IndexBuilder {
// Get the estimated size for index block. // Get the estimated size for index block.
virtual size_t EstimatedSize() const = 0; virtual size_t EstimatedSize() const = 0;
virtual bool seperator_is_key_plus_seq() { return true; }
protected: protected:
const InternalKeyComparator* comparator_; const InternalKeyComparator* comparator_;
}; };
@ -115,9 +117,14 @@ class IndexBuilder {
class ShortenedIndexBuilder : public IndexBuilder { class ShortenedIndexBuilder : public IndexBuilder {
public: public:
explicit ShortenedIndexBuilder(const InternalKeyComparator* comparator, explicit ShortenedIndexBuilder(const InternalKeyComparator* comparator,
int index_block_restart_interval) int index_block_restart_interval,
uint32_t format_version)
: IndexBuilder(comparator), : IndexBuilder(comparator),
index_block_builder_(index_block_restart_interval) {} index_block_builder_(index_block_restart_interval),
index_block_builder_without_seq_(index_block_restart_interval) {
// Making the default true will disable the feature for old versions
seperator_is_key_plus_seq_ = (format_version <= 2);
}
virtual void AddIndexEntry(std::string* last_key_in_current_block, virtual void AddIndexEntry(std::string* last_key_in_current_block,
const Slice* first_key_in_next_block, const Slice* first_key_in_next_block,
@ -125,31 +132,57 @@ class ShortenedIndexBuilder : public IndexBuilder {
if (first_key_in_next_block != nullptr) { if (first_key_in_next_block != nullptr) {
comparator_->FindShortestSeparator(last_key_in_current_block, comparator_->FindShortestSeparator(last_key_in_current_block,
*first_key_in_next_block); *first_key_in_next_block);
if (!seperator_is_key_plus_seq_ &&
comparator_->user_comparator()->Compare(
ExtractUserKey(*last_key_in_current_block),
ExtractUserKey(*first_key_in_next_block)) == 0) {
seperator_is_key_plus_seq_ = true;
}
} else { } else {
comparator_->FindShortSuccessor(last_key_in_current_block); comparator_->FindShortSuccessor(last_key_in_current_block);
} }
auto sep = Slice(*last_key_in_current_block);
std::string handle_encoding; std::string handle_encoding;
block_handle.EncodeTo(&handle_encoding); block_handle.EncodeTo(&handle_encoding);
index_block_builder_.Add(*last_key_in_current_block, handle_encoding); index_block_builder_.Add(sep, handle_encoding);
if (!seperator_is_key_plus_seq_) {
index_block_builder_without_seq_.Add(ExtractUserKey(sep),
handle_encoding);
}
} }
using IndexBuilder::Finish; using IndexBuilder::Finish;
virtual Status Finish( virtual Status Finish(
IndexBlocks* index_blocks, IndexBlocks* index_blocks,
const BlockHandle& /*last_partition_block_handle*/) override { const BlockHandle& /*last_partition_block_handle*/) override {
if (seperator_is_key_plus_seq_) {
index_blocks->index_block_contents = index_block_builder_.Finish(); index_blocks->index_block_contents = index_block_builder_.Finish();
} else {
index_blocks->index_block_contents =
index_block_builder_without_seq_.Finish();
}
return Status::OK(); return Status::OK();
} }
virtual size_t EstimatedSize() const override { virtual size_t EstimatedSize() const override {
if (seperator_is_key_plus_seq_) {
return index_block_builder_.CurrentSizeEstimate(); return index_block_builder_.CurrentSizeEstimate();
} else {
return index_block_builder_without_seq_.CurrentSizeEstimate();
}
}
virtual bool seperator_is_key_plus_seq() override {
return seperator_is_key_plus_seq_;
} }
friend class PartitionedIndexBuilder; friend class PartitionedIndexBuilder;
private: private:
BlockBuilder index_block_builder_; BlockBuilder index_block_builder_;
BlockBuilder index_block_builder_without_seq_;
bool seperator_is_key_plus_seq_;
}; };
// HashIndexBuilder contains a binary-searchable primary index and the // HashIndexBuilder contains a binary-searchable primary index and the
@ -183,9 +216,11 @@ class HashIndexBuilder : public IndexBuilder {
public: public:
explicit HashIndexBuilder(const InternalKeyComparator* comparator, explicit HashIndexBuilder(const InternalKeyComparator* comparator,
const SliceTransform* hash_key_extractor, const SliceTransform* hash_key_extractor,
int index_block_restart_interval) int index_block_restart_interval,
int format_version)
: IndexBuilder(comparator), : IndexBuilder(comparator),
primary_index_builder_(comparator, index_block_restart_interval), primary_index_builder_(comparator, index_block_restart_interval,
format_version),
hash_key_extractor_(hash_key_extractor) {} hash_key_extractor_(hash_key_extractor) {}
virtual void AddIndexEntry(std::string* last_key_in_current_block, virtual void AddIndexEntry(std::string* last_key_in_current_block,
@ -240,6 +275,10 @@ class HashIndexBuilder : public IndexBuilder {
prefix_meta_block_.size(); prefix_meta_block_.size();
} }
virtual bool seperator_is_key_plus_seq() override {
return primary_index_builder_.seperator_is_key_plus_seq();
}
private: private:
void FlushPendingPrefix() { void FlushPendingPrefix() {
prefix_block_.append(pending_entry_prefix_.data(), prefix_block_.append(pending_entry_prefix_.data(),
@ -316,6 +355,10 @@ class PartitionedIndexBuilder : public IndexBuilder {
// cutting the next partition // cutting the next partition
void RequestPartitionCut(); void RequestPartitionCut();
virtual bool seperator_is_key_plus_seq() override {
return seperator_is_key_plus_seq_;
}
private: private:
void MakeNewSubIndexBuilder(); void MakeNewSubIndexBuilder();
@ -333,6 +376,7 @@ class PartitionedIndexBuilder : public IndexBuilder {
// true if Finish is called once but not complete yet. // true if Finish is called once but not complete yet.
bool finishing_indexes = false; bool finishing_indexes = false;
const BlockBasedTableOptions& table_opt_; const BlockBasedTableOptions& table_opt_;
bool seperator_is_key_plus_seq_;
// true if an external entity (such as filter partition builder) request // true if an external entity (such as filter partition builder) request
// cutting the next partition // cutting the next partition
bool partition_cut_requested_ = true; bool partition_cut_requested_ = true;

@ -71,6 +71,7 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) {
Add(TablePropertiesNames::kIndexPartitions, props.index_partitions); Add(TablePropertiesNames::kIndexPartitions, props.index_partitions);
Add(TablePropertiesNames::kTopLevelIndexSize, props.top_level_index_size); Add(TablePropertiesNames::kTopLevelIndexSize, props.top_level_index_size);
} }
Add(TablePropertiesNames::kIndexKeyIsUserKey, props.index_key_is_user_key);
Add(TablePropertiesNames::kNumEntries, props.num_entries); Add(TablePropertiesNames::kNumEntries, props.num_entries);
Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks); Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks);
Add(TablePropertiesNames::kFilterSize, props.filter_size); Add(TablePropertiesNames::kFilterSize, props.filter_size);
@ -192,7 +193,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file,
Block properties_block(std::move(block_contents), Block properties_block(std::move(block_contents),
kDisableGlobalSequenceNumber); kDisableGlobalSequenceNumber);
BlockIter iter; BlockIter iter;
properties_block.NewIterator(BytewiseComparator(), &iter); properties_block.NewIterator(BytewiseComparator(), BytewiseComparator(),
&iter);
auto new_table_properties = new TableProperties(); auto new_table_properties = new TableProperties();
// All pre-defined properties of type uint64_t // All pre-defined properties of type uint64_t
@ -203,6 +205,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file,
&new_table_properties->index_partitions}, &new_table_properties->index_partitions},
{TablePropertiesNames::kTopLevelIndexSize, {TablePropertiesNames::kTopLevelIndexSize,
&new_table_properties->top_level_index_size}, &new_table_properties->top_level_index_size},
{TablePropertiesNames::kIndexKeyIsUserKey,
&new_table_properties->index_key_is_user_key},
{TablePropertiesNames::kFilterSize, &new_table_properties->filter_size}, {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size},
{TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size}, {TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size},
{TablePropertiesNames::kRawValueSize, {TablePropertiesNames::kRawValueSize,
@ -312,7 +316,7 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size,
Block metaindex_block(std::move(metaindex_contents), Block metaindex_block(std::move(metaindex_contents),
kDisableGlobalSequenceNumber); kDisableGlobalSequenceNumber);
std::unique_ptr<InternalIterator> meta_iter( std::unique_ptr<InternalIterator> meta_iter(
metaindex_block.NewIterator(BytewiseComparator())); metaindex_block.NewIterator(BytewiseComparator(), BytewiseComparator()));
// -- Read property block // -- Read property block
bool found_properties_block = true; bool found_properties_block = true;
@ -375,7 +379,8 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size,
kDisableGlobalSequenceNumber); kDisableGlobalSequenceNumber);
std::unique_ptr<InternalIterator> meta_iter; std::unique_ptr<InternalIterator> meta_iter;
meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator())); meta_iter.reset(
metaindex_block.NewIterator(BytewiseComparator(), BytewiseComparator()));
return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle); return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle);
} }
@ -416,7 +421,8 @@ Status ReadMetaBlock(RandomAccessFileReader* file,
kDisableGlobalSequenceNumber); kDisableGlobalSequenceNumber);
std::unique_ptr<InternalIterator> meta_iter; std::unique_ptr<InternalIterator> meta_iter;
meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator())); meta_iter.reset(
metaindex_block.NewIterator(BytewiseComparator(), BytewiseComparator()));
BlockHandle block_handle; BlockHandle block_handle;
status = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle); status = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle);

@ -113,7 +113,7 @@ PartitionedFilterBlockReader::~PartitionedFilterBlockReader() {
char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length];
BlockIter biter; BlockIter biter;
BlockHandle handle; BlockHandle handle;
idx_on_fltr_blk_->NewIterator(&comparator_, &biter, true); idx_on_fltr_blk_->NewIterator(&comparator_, &comparator_, &biter, true);
biter.SeekToFirst(); biter.SeekToFirst();
for (; biter.Valid(); biter.Next()) { for (; biter.Valid(); biter.Next()) {
auto input = biter.value(); auto input = biter.value();
@ -207,7 +207,7 @@ bool PartitionedFilterBlockReader::PrefixMayMatch(
Slice PartitionedFilterBlockReader::GetFilterPartitionHandle( Slice PartitionedFilterBlockReader::GetFilterPartitionHandle(
const Slice& entry) { const Slice& entry) {
BlockIter iter; BlockIter iter;
idx_on_fltr_blk_->NewIterator(&comparator_, &iter, true); idx_on_fltr_blk_->NewIterator(&comparator_, &comparator_, &iter, true);
iter.Seek(entry); iter.Seek(entry);
if (UNLIKELY(!iter.Valid())) { if (UNLIKELY(!iter.Valid())) {
return Slice(); return Slice();
@ -269,7 +269,7 @@ void PartitionedFilterBlockReader::CacheDependencies(
auto rep = table_->rep_; auto rep = table_->rep_;
BlockIter biter; BlockIter biter;
BlockHandle handle; BlockHandle handle;
idx_on_fltr_blk_->NewIterator(&comparator_, &biter, true); idx_on_fltr_blk_->NewIterator(&comparator_, &comparator_, &biter, true);
// Index partitions are assumed to be consecuitive. Prefetch them all. // Index partitions are assumed to be consecuitive. Prefetch them all.
// Read the first block offset // Read the first block offset
biter.SeekToFirst(); biter.SeekToFirst();

@ -90,7 +90,12 @@ std::string TableProperties::ToString(
prop_delim, kv_delim); prop_delim, kv_delim);
AppendProperty(result, "data block size", data_size, prop_delim, kv_delim); AppendProperty(result, "data block size", data_size, prop_delim, kv_delim);
AppendProperty(result, "index block size", index_size, prop_delim, kv_delim); char index_block_size_str[80];
snprintf(index_block_size_str, sizeof(index_block_size_str),
"index block size (user-key? %d)",
static_cast<int>(index_key_is_user_key));
AppendProperty(result, index_block_size_str, index_size, prop_delim,
kv_delim);
if (index_partitions != 0) { if (index_partitions != 0) {
AppendProperty(result, "# index partitions", index_partitions, prop_delim, AppendProperty(result, "# index partitions", index_partitions, prop_delim,
kv_delim); kv_delim);
@ -155,6 +160,7 @@ void TableProperties::Add(const TableProperties& tp) {
index_size += tp.index_size; index_size += tp.index_size;
index_partitions += tp.index_partitions; index_partitions += tp.index_partitions;
top_level_index_size += tp.top_level_index_size; top_level_index_size += tp.top_level_index_size;
index_key_is_user_key += tp.index_key_is_user_key;
filter_size += tp.filter_size; filter_size += tp.filter_size;
raw_key_size += tp.raw_key_size; raw_key_size += tp.raw_key_size;
raw_value_size += tp.raw_value_size; raw_value_size += tp.raw_value_size;
@ -170,6 +176,8 @@ const std::string TablePropertiesNames::kIndexPartitions =
"rocksdb.index.partitions"; "rocksdb.index.partitions";
const std::string TablePropertiesNames::kTopLevelIndexSize = const std::string TablePropertiesNames::kTopLevelIndexSize =
"rocksdb.top-level.index.size"; "rocksdb.top-level.index.size";
const std::string TablePropertiesNames::kIndexKeyIsUserKey =
"rocksdb.index.key.is.user.key";
const std::string TablePropertiesNames::kFilterSize = const std::string TablePropertiesNames::kFilterSize =
"rocksdb.filter.size"; "rocksdb.filter.size";
const std::string TablePropertiesNames::kRawKeySize = const std::string TablePropertiesNames::kRawKeySize =

@ -237,7 +237,7 @@ class BlockConstructor: public Constructor {
} }
virtual InternalIterator* NewIterator( virtual InternalIterator* NewIterator(
const SliceTransform* /*prefix_extractor*/) const override { const SliceTransform* /*prefix_extractor*/) const override {
return block_->NewIterator(comparator_); return block_->NewIterator(comparator_, comparator_);
} }
private: private:
@ -2115,7 +2115,7 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
GetContext::kNotFound, user_key, &value, nullptr, GetContext::kNotFound, user_key, &value, nullptr,
nullptr, nullptr, nullptr); nullptr, nullptr, nullptr);
ASSERT_OK(reader->Get(ReadOptions(), user_key, &get_context, ASSERT_OK(reader->Get(ReadOptions(), internal_key.Encode(), &get_context,
moptions4.prefix_extractor.get())); moptions4.prefix_extractor.get()));
ASSERT_STREQ(value.data(), "hello"); ASSERT_STREQ(value.data(), "hello");
BlockCachePropertiesSnapshot props(options.statistics.get()); BlockCachePropertiesSnapshot props(options.statistics.get());
@ -2427,7 +2427,8 @@ TEST_F(BlockBasedTableTest, BlockCacheLeak) {
ASSERT_OK(c.Reopen(ioptions1, moptions1)); ASSERT_OK(c.Reopen(ioptions1, moptions1));
auto table_reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader()); auto table_reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
for (const std::string& key : keys) { for (const std::string& key : keys) {
ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key)); InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode()));
} }
c.ResetTableReader(); c.ResetTableReader();
@ -2439,7 +2440,8 @@ TEST_F(BlockBasedTableTest, BlockCacheLeak) {
ASSERT_OK(c.Reopen(ioptions2, moptions2)); ASSERT_OK(c.Reopen(ioptions2, moptions2));
table_reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader()); table_reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
for (const std::string& key : keys) { for (const std::string& key : keys) {
ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), key)); InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode()));
} }
c.ResetTableReader(); c.ResetTableReader();
} }

@ -87,13 +87,6 @@ class PlainInternalKeyComparator : public InternalKeyComparator {
virtual int Compare(const Slice& a, const Slice& b) const override { virtual int Compare(const Slice& a, const Slice& b) const override {
return user_comparator()->Compare(a, b); return user_comparator()->Compare(a, b);
} }
virtual void FindShortestSeparator(std::string* start,
const Slice& limit) const override {
user_comparator()->FindShortestSeparator(start, limit);
}
virtual void FindShortSuccessor(std::string* key) const override {
user_comparator()->FindShortSuccessor(key);
}
}; };
#endif #endif

Loading…
Cancel
Save