Index value delta encoding (#3983)

Summary:
Given that index value is a BlockHandle, which is basically an <offset, size> pair we can apply delta encoding on the values. The first value at each index restart interval encoded the full BlockHandle but the rest encode only the size. Refer to IndexBlockIter::DecodeCurrentValue for the detail of the encoding. This reduces the index size which helps using the  block cache more efficiently. The feature is enabled with using format_version 4.

The feature comes with a bit of cpu overhead which should be paid back by the higher cache hits due to smaller index block size.
Results with sysbench read-only using 4k blocks and using 16 index restart interval:
Format 2:
19585   rocksdb read-only range=100
Format 3:
19569   rocksdb read-only range=100
Format 4:
19352   rocksdb read-only range=100
Pull Request resolved: https://github.com/facebook/rocksdb/pull/3983

Differential Revision: D8361343

Pulled By: maysamyabandeh

fbshipit-source-id: f882ee082322acac32b0072e2bdbb0b5f854e651
main
Maysam Yabandeh 7 years ago committed by Facebook Github Bot
parent 63f265c383
commit caf0f53a74
  1. 3
      HISTORY.md
  2. 1
      db/builder.h
  3. 2
      db/db_impl.cc
  4. 1
      db/db_iter.h
  5. 39
      db/db_properties_test.cc
  6. 4
      db/db_test_util.cc
  7. 2
      db/db_test_util.h
  8. 1
      db/memtable.h
  9. 1
      db/merge_helper.h
  10. 4
      db/table_cache.cc
  11. 1
      db/table_cache.h
  12. 4
      db/version_edit.cc
  13. 1
      db/version_set.h
  14. 3
      include/rocksdb/table_properties.h
  15. 185
      table/block.cc
  16. 90
      table/block.h
  17. 17
      table/block_based_table_builder.cc
  18. 341
      table/block_based_table_reader.cc
  19. 47
      table/block_based_table_reader.h
  20. 48
      table/block_builder.cc
  21. 9
      table/block_builder.h
  22. 101
      table/block_test.cc
  23. 5
      table/cuckoo_table_reader.cc
  24. 1
      table/cuckoo_table_reader.h
  25. 19
      table/format.cc
  26. 4
      table/format.h
  27. 48
      table/index_builder.cc
  28. 40
      table/index_builder.h
  29. 28
      table/internal_iterator.h
  30. 49
      table/iterator.cc
  31. 34
      table/iterator_wrapper.h
  32. 2
      table/merging_iterator.cc
  33. 4
      table/merging_iterator.h
  34. 4
      table/meta_blocks.cc
  35. 1
      table/meta_blocks.h
  36. 85
      table/partitioned_filter_block.cc
  37. 12
      table/partitioned_filter_block.h
  38. 12
      table/partitioned_filter_block_test.cc
  39. 1
      table/plain_table_reader.h
  40. 8
      table/table_properties.cc
  41. 1
      table/table_properties_internal.h
  42. 1
      table/table_reader.h
  43. 14
      table/table_test.cc
  44. 63
      table/two_level_iterator.cc
  45. 8
      table/two_level_iterator.h
  46. 24
      util/coding.h
  47. 2
      util/testutil.cc
  48. 2
      utilities/transactions/transaction_test.h

@ -2,6 +2,7 @@
## Unreleased
### Public API Change
### New Features
* Changes the format of index blocks by delta encoding the index values, which are the block handles. This saves the encoding of BlockHandle::offset of the non-head index entries in each restart interval. The feature is backward compatible but not forward compatible. It is disabled by default unless format_version 4 or above is used.
### Bug Fixes
## 5.15.0 (7/17/2018)
@ -13,7 +14,7 @@
* The "rocksdb.num.entries" table property no longer counts range deletion tombstones as entries.
### New Features
* Changes the format of index blocks by storing the key in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatbile but not forward compatible. It is disabled by default unless format_version 3 or above is used.
* Changes the format of index blocks by storing the key in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatible but not forward compatible. It is disabled by default unless format_version 3 or above is used.
* Avoid memcpy when reading mmap files with OpenReadOnly and max_open_files==-1.
* Support dynamically changing `ColumnFamilyOptions::ttl` via `SetOptions()`.
* Add a new table property, "rocksdb.num.range-deletions", which counts the number of range deletion tombstones in the table.

@ -35,7 +35,6 @@ class VersionEdit;
class TableBuilder;
class WritableFileWriter;
class InternalStats;
class InternalIterator;
// @param column_family_name Name of the column family that is also identified
// by column_family_id, or empty string if unknown. It must outlive the

@ -1047,7 +1047,7 @@ InternalIterator* DBImpl::NewInternalIterator(
} else {
CleanupSuperVersion(super_version);
}
return NewErrorInternalIterator(s, arena);
return NewErrorInternalIterator<Slice>(s, arena);
}
ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const {

@ -23,7 +23,6 @@ namespace rocksdb {
class Arena;
class DBIter;
class InternalIterator;
// Return a new iterator that converts internal keys (yielded by
// "*internal_iter") that were live at the specified "sequence" number

@ -180,17 +180,16 @@ void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) {
ResetTableProperties(tp);
sscanf(tp_string.c_str(),
"# data blocks %" SCNu64 " # entries %" SCNu64
" # range deletions %" SCNu64
" raw key size %" SCNu64
" # range deletions %" SCNu64 " raw key size %" SCNu64
" raw average key size %lf "
" raw value size %" SCNu64
" raw average value size %lf "
" data block size %" SCNu64 " index block size (user-key? %" SCNu64
") %" SCNu64 " filter block size %" SCNu64,
", delta-value? %" SCNu64 ") %" SCNu64 " filter block size %" SCNu64,
&tp->num_data_blocks, &tp->num_entries, &tp->num_range_deletions,
&tp->raw_key_size, &dummy_double, &tp->raw_value_size, &dummy_double,
&tp->data_size, &tp->index_key_is_user_key, &tp->index_size,
&tp->filter_size);
&tp->data_size, &tp->index_key_is_user_key,
&tp->index_value_is_delta_encoded, &tp->index_size, &tp->filter_size);
}
void VerifySimilar(uint64_t a, uint64_t b, double bias) {
@ -224,14 +223,11 @@ void VerifyTableProperties(const TableProperties& base_tp,
ASSERT_EQ(base_tp.num_range_deletions, new_tp.num_range_deletions);
}
void GetExpectedTableProperties(TableProperties* expected_tp,
const int kKeySize, const int kValueSize,
const int kKeysPerTable,
const int kRangeDeletionsPerTable,
const int kTableCount,
const int kBloomBitsPerKey,
const size_t kBlockSize,
const bool index_key_is_user_key) {
void GetExpectedTableProperties(
TableProperties* expected_tp, const int kKeySize, const int kValueSize,
const int kKeysPerTable, const int kRangeDeletionsPerTable,
const int kTableCount, const int kBloomBitsPerKey, const size_t kBlockSize,
const bool index_key_is_user_key, const bool value_delta_encoding) {
const int kKeyCount = kTableCount * kKeysPerTable;
const int kRangeDeletionCount = kTableCount * kRangeDeletionsPerTable;
const int kAvgSuccessorSize = kKeySize / 5;
@ -248,7 +244,9 @@ void GetExpectedTableProperties(TableProperties* expected_tp,
kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize));
expected_tp->index_size =
expected_tp->num_data_blocks *
(kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8));
(kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8) -
// discount 1 byte as value size is not encoded in value delta encoding
(value_delta_encoding ? 1 : 0));
expected_tp->filter_size =
kTableCount * (kKeysPerTable * kBloomBitsPerKey / 8);
}
@ -342,12 +340,14 @@ TEST_F(DBPropertiesTest, AggregatedTableProperties) {
TableProperties output_tp;
ParseTablePropertiesString(property, &output_tp);
bool index_key_is_user_key = output_tp.index_key_is_user_key > 0;
bool value_is_delta_encoded = output_tp.index_value_is_delta_encoded > 0;
TableProperties expected_tp;
GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize,
kKeysPerTable, kRangeDeletionsPerTable,
kTableCount, kBloomBitsPerKey,
table_options.block_size, index_key_is_user_key);
table_options.block_size, index_key_is_user_key,
value_is_delta_encoded);
VerifyTableProperties(expected_tp, output_tp);
}
@ -533,6 +533,7 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string);
ParseTablePropertiesString(tp_string, &tp);
bool index_key_is_user_key = tp.index_key_is_user_key > 0;
bool value_is_delta_encoded = tp.index_value_is_delta_encoded > 0;
ASSERT_EQ(sum_tp.data_size, tp.data_size);
ASSERT_EQ(sum_tp.index_size, tp.index_size);
ASSERT_EQ(sum_tp.filter_size, tp.filter_size);
@ -542,10 +543,10 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
ASSERT_EQ(sum_tp.num_entries, tp.num_entries);
ASSERT_EQ(sum_tp.num_range_deletions, tp.num_range_deletions);
if (table > 3) {
GetExpectedTableProperties(
&expected_tp, kKeySize, kValueSize, kKeysPerTable,
kRangeDeletionsPerTable, table, kBloomBitsPerKey,
table_options.block_size, index_key_is_user_key);
GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize,
kKeysPerTable, kRangeDeletionsPerTable, table,
kBloomBitsPerKey, table_options.block_size,
index_key_is_user_key, value_is_delta_encoded);
// Gives larger bias here as index block size, filter block size,
// and data block size become much harder to estimate in this test.
VerifyTableProperties(expected_tp, tp, 0.5, 0.4, 0.4, 0.25);

@ -449,8 +449,8 @@ Options DBTestBase::GetOptions(
options.prefix_extractor.reset(NewNoopTransform());
break;
}
case kBlockBasedTableWithPartitionedIndexFormat3: {
table_options.format_version = 3;
case kBlockBasedTableWithPartitionedIndexFormat4: {
table_options.format_version = 4;
// Format 3 changes the binary index format. Since partitioned index is a
// super-set of simple indexes, we are also using kTwoLevelIndexSearch to
// test this format.

@ -698,7 +698,7 @@ class DBTestBase : public testing::Test {
kLevelSubcompactions,
kBlockBasedTableWithIndexRestartInterval,
kBlockBasedTableWithPartitionedIndex,
kBlockBasedTableWithPartitionedIndexFormat3,
kBlockBasedTableWithPartitionedIndexFormat4,
kPartitionedFilterWithNewTableReaderForCompactions,
kUniversalSubcompactions,
// This must be the last line

@ -34,7 +34,6 @@ namespace rocksdb {
class Mutex;
class MemTableIterator;
class MergeContext;
class InternalIterator;
struct ImmutableMemTableOptions {
explicit ImmutableMemTableOptions(const ImmutableCFOptions& ioptions,

@ -26,7 +26,6 @@ class Iterator;
class Logger;
class MergeOperator;
class Statistics;
class InternalIterator;
class MergeHelper {
public:

@ -238,7 +238,7 @@ InternalIterator* TableCache::NewIterator(
if (s.ok()) {
if (options.table_filter &&
!options.table_filter(*table_reader->GetTableProperties())) {
result = NewEmptyInternalIterator(arena);
result = NewEmptyInternalIterator<Slice>(arena);
} else {
result = table_reader->NewIterator(options, prefix_extractor, arena,
skip_filters, for_compaction);
@ -279,7 +279,7 @@ InternalIterator* TableCache::NewIterator(
}
if (!s.ok()) {
assert(result == nullptr);
result = NewErrorInternalIterator(s, arena);
result = NewErrorInternalIterator<Slice>(s, arena);
}
return result;
}

@ -31,7 +31,6 @@ class Arena;
struct FileDescriptor;
class GetContext;
class HistogramImpl;
class InternalIterator;
class TableCache {
public:

@ -46,7 +46,7 @@ enum CustomTag : uint32_t {
kTerminate = 1, // The end of customized fields
kNeedCompaction = 2,
// Since Manifest is not entirely currently forward-compatible, and the only
// forward-compatbile part is the CutsomtTag of kNewFile, we currently encode
// forward-compatible part is the CutsomtTag of kNewFile, we currently encode
// kMinLogNumberToKeep as part of a CustomTag as a hack. This should be
// removed when manifest becomes forward-comptabile.
kMinLogNumberToKeepHack = 3,
@ -274,7 +274,7 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) {
break;
case kMinLogNumberToKeepHack:
// This is a hack to encode kMinLogNumberToKeep in a
// forward-compatbile fashion.
// forward-compatible fashion.
if (!GetFixed64(&field, &min_log_number_to_keep_)) {
return "deleted log number malformatted";
}

@ -52,7 +52,6 @@ class Writer;
}
class Compaction;
class InternalIterator;
class LogBuffer;
class LookupKey;
class MemTable;

@ -34,6 +34,7 @@ struct TablePropertiesNames {
static const std::string kIndexPartitions;
static const std::string kTopLevelIndexSize;
static const std::string kIndexKeyIsUserKey;
static const std::string kIndexValueIsDeltaEncoded;
static const std::string kFilterSize;
static const std::string kRawKeySize;
static const std::string kRawValueSize;
@ -139,6 +140,8 @@ struct TableProperties {
// Whether the index key is user key. Otherwise it includes 8 byte of sequence
// number added by internal key format.
uint64_t index_key_is_user_key = 0;
// Whether delta encoding is used to encode the index values.
uint64_t index_value_is_delta_encoded = 0;
// the size of filter block.
uint64_t filter_size = 0;
// total raw key size

@ -33,28 +33,65 @@ namespace rocksdb {
//
// If any errors are detected, returns nullptr. Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
static inline const char* DecodeEntry(const char* p, const char* limit,
uint32_t* shared,
uint32_t* non_shared,
uint32_t* value_length) {
if (limit - p < 3) return nullptr;
*shared = reinterpret_cast<const unsigned char*>(p)[0];
*non_shared = reinterpret_cast<const unsigned char*>(p)[1];
*value_length = reinterpret_cast<const unsigned char*>(p)[2];
if ((*shared | *non_shared | *value_length) < 128) {
// Fast path: all three values are encoded in one byte each
p += 3;
} else {
if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) return nullptr;
}
struct DecodeEntry {
inline const char* operator()(const char* p, const char* limit,
uint32_t* shared, uint32_t* non_shared,
uint32_t* value_length) {
// We need 2 bytes for shared and non_shared size. We also need one more
// byte either for value size or the actual value in case of value delta
// encoding.
assert(limit - p >= 3);
*shared = reinterpret_cast<const unsigned char*>(p)[0];
*non_shared = reinterpret_cast<const unsigned char*>(p)[1];
*value_length = reinterpret_cast<const unsigned char*>(p)[2];
if ((*shared | *non_shared | *value_length) < 128) {
// Fast path: all three values are encoded in one byte each
p += 3;
} else {
if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
return nullptr;
}
}
if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
return nullptr;
// Using an assert in place of "return null" since we should not pay the
// cost of checking for corruption on every single key decoding
assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)));
return p;
}
};
struct DecodeKey {
inline const char* operator()(const char* p, const char* limit,
uint32_t* shared, uint32_t* non_shared) {
uint32_t value_length;
return DecodeEntry()(p, limit, shared, non_shared, &value_length);
}
};
// In format_version 4, which is used by index blocks, the value size is not
// encoded before the entry, as the value is known to be the handle with the
// known size.
struct DecodeKeyV4 {
inline const char* operator()(const char* p, const char* limit,
uint32_t* shared, uint32_t* non_shared) {
// We need 2 bytes for shared and non_shared size. We also need one more
// byte either for value size or the actual value in case of value delta
// encoding.
if (limit - p < 3) return nullptr;
*shared = reinterpret_cast<const unsigned char*>(p)[0];
*non_shared = reinterpret_cast<const unsigned char*>(p)[1];
if ((*shared | *non_shared) < 128) {
// Fast path: all three values are encoded in one byte each
p += 2;
} else {
if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
}
return p;
}
return p;
}
};
void DataBlockIter::Next() {
assert(Valid());
@ -170,7 +207,8 @@ void DataBlockIter::Seek(const Slice& target) {
return;
}
uint32_t index = 0;
bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, comparator_);
bool ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
comparator_);
if (!ok) {
return;
@ -198,8 +236,12 @@ void IndexBlockIter::Seek(const Slice& target) {
bool ok = false;
if (prefix_index_) {
ok = PrefixSeek(target, &index);
} else if (value_delta_encoded_) {
ok = BinarySeek<DecodeKeyV4>(seek_key, 0, num_restarts_ - 1, &index,
active_comparator_);
} else {
ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, active_comparator_);
ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
active_comparator_);
}
if (!ok) {
@ -222,7 +264,8 @@ void DataBlockIter::SeekForPrev(const Slice& target) {
return;
}
uint32_t index = 0;
bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, comparator_);
bool ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
comparator_);
if (!ok) {
return;
@ -277,7 +320,8 @@ void IndexBlockIter::SeekToLast() {
}
}
void BlockIter::CorruptionError() {
template <class TValue>
void BlockIter<TValue>::CorruptionError() {
current_ = restarts_;
restart_index_ = num_restarts_;
status_ = Status::Corruption("bad entry in block");
@ -298,7 +342,7 @@ bool DataBlockIter::ParseNextDataKey() {
// Decode next entry
uint32_t shared, non_shared, value_length;
p = DecodeEntry(p, limit, &shared, &non_shared, &value_length);
p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length);
if (p == nullptr || key_.Size() < shared) {
CorruptionError();
return false;
@ -340,10 +384,14 @@ bool DataBlockIter::ParseNextDataKey() {
}
value_ = Slice(p + non_shared, value_length);
while (restart_index_ + 1 < num_restarts_ &&
GetRestartPoint(restart_index_ + 1) < current_) {
++restart_index_;
if (shared == 0) {
while (restart_index_ + 1 < num_restarts_ &&
GetRestartPoint(restart_index_ + 1) < current_) {
++restart_index_;
}
}
// else we are in the middle of a restart interval and the restart_index_
// thus has not changed
return true;
}
}
@ -361,7 +409,12 @@ bool IndexBlockIter::ParseNextIndexKey() {
// Decode next entry
uint32_t shared, non_shared, value_length;
p = DecodeEntry(p, limit, &shared, &non_shared, &value_length);
if (value_delta_encoded_) {
p = DecodeKeyV4()(p, limit, &shared, &non_shared);
value_length = 0;
} else {
p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length);
}
if (p == nullptr || key_.Size() < shared) {
CorruptionError();
return false;
@ -377,27 +430,69 @@ bool IndexBlockIter::ParseNextIndexKey() {
key_pinned_ = false;
}
value_ = Slice(p + non_shared, value_length);
while (restart_index_ + 1 < num_restarts_ &&
GetRestartPoint(restart_index_ + 1) < current_) {
++restart_index_;
if (shared == 0) {
while (restart_index_ + 1 < num_restarts_ &&
GetRestartPoint(restart_index_ + 1) < current_) {
++restart_index_;
}
}
// else we are in the middle of a restart interval and the restart_index_
// thus has not changed
if (value_delta_encoded_) {
assert(value_length == 0);
DecodeCurrentValue(shared);
}
return true;
}
// The format:
// restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// ...
// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// where, k is key, v is value, and its encoding is in parenthesis.
// The format of each key is (shared_size, non_shared_size, shared, non_shared)
// The format of each value, i.e., block hanlde, is (offset, size) whenever the
// shared_size is 0, which included the first entry in each restart point.
// Otherwise the format is delta-size = block handle size - size of last block
// handle.
void IndexBlockIter::DecodeCurrentValue(uint32_t shared) {
assert(value_delta_encoded_);
const char* limit = data_ + restarts_;
if (shared == 0) {
uint64_t o, s;
const char* newp = GetVarint64Ptr(value_.data(), limit, &o);
newp = GetVarint64Ptr(newp, limit, &s);
decoded_value_ = BlockHandle(o, s);
value_ = Slice(value_.data(), newp - value_.data());
} else {
uint64_t next_value_base =
decoded_value_.offset() + decoded_value_.size() + kBlockTrailerSize;
int64_t delta;
const char* newp = GetVarsignedint64Ptr(value_.data(), limit, &delta);
decoded_value_ =
BlockHandle(next_value_base, decoded_value_.size() + delta);
value_ = Slice(value_.data(), newp - value_.data());
}
}
// Binary search in restart array to find the first restart point that
// is either the last restart point with a key less than target,
// which means the key of next restart point is larger than target, or
// the first restart point with a key = target
bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right,
uint32_t* index, const Comparator* comp) {
template <class TValue>
template <typename DecodeKeyFunc>
bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t left,
uint32_t right, uint32_t* index,
const Comparator* comp) {
assert(left <= right);
while (left < right) {
uint32_t mid = (left + right + 1) / 2;
uint32_t region_offset = GetRestartPoint(mid);
uint32_t shared, non_shared, value_length;
const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_,
&shared, &non_shared, &value_length);
uint32_t shared, non_shared;
const char* key_ptr = DecodeKeyFunc()(
data_ + region_offset, data_ + restarts_, &shared, &non_shared);
if (key_ptr == nullptr || (shared != 0)) {
CorruptionError();
return false;
@ -425,9 +520,13 @@ bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right,
// Return -1 if error.
int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
uint32_t region_offset = GetRestartPoint(block_index);
uint32_t shared, non_shared, value_length;
const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_,
&shared, &non_shared, &value_length);
uint32_t shared, non_shared;
const char* key_ptr =
value_delta_encoded_
? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared,
&non_shared)
: DecodeKey()(data_ + region_offset, data_ + restarts_, &shared,
&non_shared);
if (key_ptr == nullptr || (shared != 0)) {
CorruptionError();
return 1; // Return target is smaller
@ -544,6 +643,7 @@ DataBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp,
DataBlockIter* iter, Statistics* stats,
bool /*total_order_seek*/,
bool /*key_includes_seq*/,
bool /*value_is_full*/,
BlockPrefixIndex* /*prefix_index*/) {
DataBlockIter* ret_iter;
if (iter != nullptr) {
@ -577,7 +677,7 @@ template <>
IndexBlockIter* Block::NewIterator(const Comparator* cmp,
const Comparator* ucmp, IndexBlockIter* iter,
Statistics* /*stats*/, bool total_order_seek,
bool key_includes_seq,
bool key_includes_seq, bool value_is_full,
BlockPrefixIndex* prefix_index) {
IndexBlockIter* ret_iter;
if (iter != nullptr) {
@ -597,7 +697,8 @@ IndexBlockIter* Block::NewIterator(const Comparator* cmp,
BlockPrefixIndex* prefix_index_ptr =
total_order_seek ? nullptr : prefix_index;
ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_,
prefix_index_ptr, key_includes_seq, cachable());
prefix_index_ptr, key_includes_seq, value_is_full,
cachable());
}
return ret_iter;

@ -35,6 +35,7 @@ namespace rocksdb {
struct BlockContents;
class Comparator;
template <class TValue>
class BlockIter;
class DataBlockIter;
class IndexBlockIter;
@ -164,6 +165,11 @@ class Block {
// If iter is null, return new Iterator
// If iter is not null, update this one and return it as Iterator*
//
// key_includes_seq, default true, means that the keys are in internal key
// format.
// value_is_full, default ture, means that no delta encoding is
// applied to values.
//
// NewIterator<DataBlockIter>
// Same as above but also updates read_amp_bitmap_ if it is not nullptr.
//
@ -175,13 +181,11 @@ class Block {
// the iterator will simply be set as "invalid", rather than returning
// the key that is just pass the target key.
template <typename TBlockIter>
TBlockIter* NewIterator(const Comparator* comparator,
const Comparator* user_comparator,
TBlockIter* iter = nullptr,
Statistics* stats = nullptr,
bool total_order_seek = true,
bool key_includes_seq = true,
BlockPrefixIndex* prefix_index = nullptr);
TBlockIter* NewIterator(
const Comparator* comparator, const Comparator* user_comparator,
TBlockIter* iter = nullptr, Statistics* stats = nullptr,
bool total_order_seek = true, bool key_includes_seq = true,
bool value_is_full = true, BlockPrefixIndex* prefix_index = nullptr);
// Report an approximation of how much memory has been used.
size_t ApproximateMemoryUsage() const;
@ -204,7 +208,8 @@ class Block {
void operator=(const Block&) = delete;
};
class BlockIter : public InternalIterator {
template <class TValue>
class BlockIter : public InternalIteratorBase<TValue> {
public:
void InitializeBase(const Comparator* comparator, const char* data,
uint32_t restarts, uint32_t num_restarts,
@ -243,10 +248,6 @@ class BlockIter : public InternalIterator {
assert(Valid());
return key_.GetKey();
}
virtual Slice value() const override {
assert(Valid());
return value_;
}
#ifndef NDEBUG
virtual ~BlockIter() {
@ -280,7 +281,8 @@ class BlockIter : public InternalIterator {
const char* data_; // underlying block contents
uint32_t num_restarts_; // Number of uint32_t entries in restart array
uint32_t restart_index_; // Index of restart block in which current_ falls
// Index of restart block in which current_ or current_-1 falls
uint32_t restart_index_;
uint32_t restarts_; // Offset of restart array (list of fixed32)
// current_ is offset in data_ of current entry. >= restarts_ if !Valid
uint32_t current_;
@ -316,11 +318,12 @@ class BlockIter : public InternalIterator {
void CorruptionError();
bool BinarySeek(const Slice& target, uint32_t left, uint32_t right,
uint32_t* index, const Comparator* comp);
template <typename DecodeKeyFunc>
inline bool BinarySeek(const Slice& target, uint32_t left, uint32_t right,
uint32_t* index, const Comparator* comp);
};
class DataBlockIter final : public BlockIter {
class DataBlockIter final : public BlockIter<Slice> {
public:
DataBlockIter()
: BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {}
@ -405,14 +408,14 @@ class DataBlockIter final : public BlockIter {
std::vector<CachedPrevEntry> prev_entries_;
int32_t prev_entries_idx_ = -1;
bool ParseNextDataKey();
inline bool ParseNextDataKey();
inline int Compare(const IterKey& ikey, const Slice& b) const {
return comparator_->Compare(ikey.GetInternalKey(), b);
}
};
class IndexBlockIter final : public BlockIter {
class IndexBlockIter final : public BlockIter<BlockHandle> {
public:
IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {}
@ -420,27 +423,46 @@ class IndexBlockIter final : public BlockIter {
assert(Valid());
return key_.GetKey();
}
// key_includes_seq, default true, means that the keys are in internal key
// format.
// value_is_full, default ture, means that no delta encoding is
// applied to values.
IndexBlockIter(const Comparator* comparator,
const Comparator* user_comparator, const char* data,
uint32_t restarts, uint32_t num_restarts,
BlockPrefixIndex* prefix_index, bool key_includes_seq,
bool block_contents_pinned)
bool value_is_full, bool block_contents_pinned)
: IndexBlockIter() {
Initialize(comparator, user_comparator, data, restarts, num_restarts,
prefix_index, key_includes_seq, block_contents_pinned);
prefix_index, key_includes_seq, block_contents_pinned,
value_is_full);
}
void Initialize(const Comparator* comparator,
const Comparator* user_comparator, const char* data,
uint32_t restarts, uint32_t num_restarts,
BlockPrefixIndex* prefix_index, bool key_includes_seq,
bool block_contents_pinned) {
bool value_is_full, bool block_contents_pinned) {
InitializeBase(comparator, data, restarts, num_restarts,
kDisableGlobalSequenceNumber, block_contents_pinned);
key_includes_seq_ = key_includes_seq;
active_comparator_ = key_includes_seq_ ? comparator_ : user_comparator;
key_.SetIsUserKey(!key_includes_seq_);
prefix_index_ = prefix_index;
value_delta_encoded_ = !value_is_full;
}
virtual BlockHandle value() const override {
assert(Valid());
if (value_delta_encoded_) {
return decoded_value_;
} else {
BlockHandle handle;
Slice v = value_;
Status decode_s __attribute__((__unused__)) = handle.DecodeFrom(&v);
assert(decode_s.ok());
return handle;
}
}
virtual void Seek(const Slice& target) override;
@ -467,11 +489,25 @@ class IndexBlockIter final : public BlockIter {
void Invalidate(Status s) { InvalidateBase(s); }
private:
// Key is in InternalKey format
bool key_includes_seq_;
bool value_delta_encoded_;
// key_includes_seq_ ? comparator_ : user_comparator_
const Comparator* active_comparator_;
BlockPrefixIndex* prefix_index_;
// Whether the value is delta encoded. In that case the value is assumed to be
// BlockHandle. The first value in each restart interval is the full encoded
// BlockHandle; the restart of encoded size part of the BlockHandle. The
// offset of delta encoded BlockHandles is computed by adding the size of
// previous delta encoded values in the same restart interval to the offset of
// the first value in that restart interval.
BlockHandle decoded_value_;
bool PrefixSeek(const Slice& target, uint32_t* index);
bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
uint32_t left, uint32_t right,
uint32_t* index);
int CompareBlockKey(uint32_t block_index, const Slice& target);
inline int CompareBlockKey(uint32_t block_index, const Slice& target);
inline int Compare(const Slice& a, const Slice& b) const {
return active_comparator_->Compare(a, b);
@ -481,13 +517,11 @@ class IndexBlockIter final : public BlockIter {
return active_comparator_->Compare(ikey.GetKey(), b);
}
bool ParseNextIndexKey();
inline bool ParseNextIndexKey();
// Key is in InternalKey format
bool key_includes_seq_;
// key_includes_seq_ ? comparator_ : user_comparator_
const Comparator* active_comparator_;
BlockPrefixIndex* prefix_index_;
// When value_delta_encoded_ is enabled it decodes the value which is assumed
// to be BlockHandle and put it to decoded_value_
inline void DecodeCurrentValue(uint32_t shared);
};
} // namespace rocksdb

@ -63,6 +63,7 @@ namespace {
FilterBlockBuilder* CreateFilterBlockBuilder(
const ImmutableCFOptions& /*opt*/, const MutableCFOptions& mopt,
const BlockBasedTableOptions& table_opt,
const bool use_delta_encoding_for_index_values,
PartitionedIndexBuilder* const p_index_builder) {
if (table_opt.filter_policy == nullptr) return nullptr;
@ -85,7 +86,7 @@ FilterBlockBuilder* CreateFilterBlockBuilder(
return new PartitionedFilterBlockBuilder(
mopt.prefix_extractor.get(), table_opt.whole_key_filtering,
filter_bits_builder, table_opt.index_block_restart_interval,
p_index_builder, partition_size);
use_delta_encoding_for_index_values, p_index_builder, partition_size);
} else {
return new FullFilterBlockBuilder(mopt.prefix_extractor.get(),
table_opt.whole_key_filtering,
@ -266,6 +267,7 @@ struct BlockBasedTableBuilder::Rep {
TableProperties props;
bool closed = false; // Either Finish() or Abandon() has been called.
const bool use_delta_encoding_for_index_values;
std::unique_ptr<FilterBlockBuilder> filter_builder;
char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize];
size_t compressed_cache_key_prefix_size;
@ -306,6 +308,8 @@ struct BlockBasedTableBuilder::Rep {
internal_prefix_transform(_moptions.prefix_extractor.get()),
compression_dict(_compression_dict),
compression_ctx(_compression_type, _compression_opts),
use_delta_encoding_for_index_values(table_opt.format_version >= 4 &&
!table_opt.block_align),
compressed_cache_key_prefix_size(0),
flush_block_policy(
table_options.flush_block_policy_factory->NewFlushBlockPolicy(
@ -317,18 +321,21 @@ struct BlockBasedTableBuilder::Rep {
if (table_options.index_type ==
BlockBasedTableOptions::kTwoLevelIndexSearch) {
p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder(
&internal_comparator, table_options);
&internal_comparator, use_delta_encoding_for_index_values,
table_options);
index_builder.reset(p_index_builder_);
} else {
index_builder.reset(IndexBuilder::CreateIndexBuilder(
table_options.index_type, &internal_comparator,
&this->internal_prefix_transform, table_options));
&this->internal_prefix_transform, use_delta_encoding_for_index_values,
table_options));
}
if (skip_filters) {
filter_builder = nullptr;
} else {
filter_builder.reset(CreateFilterBlockBuilder(
_ioptions, _moptions, table_options, p_index_builder_));
_ioptions, _moptions, table_options,
use_delta_encoding_for_index_values, p_index_builder_));
}
for (auto& collector_factories : *int_tbl_prop_collector_factories) {
@ -793,6 +800,8 @@ void BlockBasedTableBuilder::WritePropertiesBlock(
}
rep_->props.index_key_is_user_key =
!rep_->index_builder->seperator_is_key_plus_seq();
rep_->props.index_value_is_delta_encoded =
rep_->use_delta_encoding_for_index_values;
rep_->props.creation_time = rep_->creation_time;
rep_->props.oldest_key_time = rep_->oldest_key_time;

@ -214,7 +214,8 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
const InternalKeyComparator* icomparator,
IndexReader** index_reader,
const PersistentCacheOptions& cache_options,
const int level, const bool index_key_includes_seq) {
const int level, const bool index_key_includes_seq,
const bool index_value_is_full) {
std::unique_ptr<Block> index_block;
auto s = ReadBlockFromFile(
file, prefetch_buffer, footer, ReadOptions(), index_handle,
@ -225,36 +226,37 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
if (s.ok()) {
*index_reader = new PartitionIndexReader(
table, icomparator, std::move(index_block), ioptions.statistics,
level, index_key_includes_seq);
level, index_key_includes_seq, index_value_is_full);
}
return s;
}
// return a two-level iterator: first level is on the partition index
virtual InternalIterator* NewIterator(IndexBlockIter* /*iter*/ = nullptr,
bool /*dont_care*/ = true,
bool fill_cache = true) override {
virtual InternalIteratorBase<BlockHandle>* NewIterator(
IndexBlockIter* /*iter*/ = nullptr, bool /*dont_care*/ = true,
bool fill_cache = true) override {
Statistics* kNullStats = nullptr;
// Filters are already checked before seeking the index
if (!partition_map_.empty()) {
return NewTwoLevelIterator(
new BlockBasedTable::PartitionedIndexIteratorState(
table_, &partition_map_, index_key_includes_seq_),
table_, &partition_map_, index_key_includes_seq_,
index_value_is_full_),
index_block_->NewIterator<IndexBlockIter>(
icomparator_, icomparator_->user_comparator(), nullptr,
kNullStats, true, index_key_includes_seq_));
kNullStats, true, index_key_includes_seq_, index_value_is_full_));
} else {
auto ro = ReadOptions();
ro.fill_cache = fill_cache;
bool kIsIndex = true;
return new BlockBasedTableIterator<IndexBlockIter>(
return new BlockBasedTableIterator<IndexBlockIter, BlockHandle>(
table_, ro, *icomparator_,
index_block_->NewIterator<IndexBlockIter>(
icomparator_, icomparator_->user_comparator(), nullptr,
kNullStats, true, index_key_includes_seq_),
kNullStats, true, index_key_includes_seq_, index_value_is_full_),
false, true, /* prefix_extractor */ nullptr, kIsIndex,
index_key_includes_seq_);
index_key_includes_seq_, index_value_is_full_);
}
// TODO(myabandeh): Update TwoLevelIterator to be able to make use of
// on-stack BlockIter while the state is on heap. Currentlly it assumes
@ -270,7 +272,7 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
Statistics* kNullStats = nullptr;
index_block_->NewIterator<IndexBlockIter>(
icomparator_, icomparator_->user_comparator(), &biter, kNullStats, true,
index_key_includes_seq_);
index_key_includes_seq_, index_value_is_full_);
// Index partitions are assumed to be consecuitive. Prefetch them all.
// Read the first block offset
biter.SeekToFirst();
@ -278,14 +280,7 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
// Empty index.
return;
}
Slice input = biter.value();
Status s = handle.DecodeFrom(&input);
assert(s.ok());
if (!s.ok()) {
ROCKS_LOG_WARN(rep->ioptions.info_log,
"Could not read first index partition");
return;
}
handle = biter.value();
uint64_t prefetch_off = handle.offset();
// Read the last block's offset
@ -294,36 +289,21 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
// Empty index.
return;
}
input = biter.value();
s = handle.DecodeFrom(&input);
assert(s.ok());
if (!s.ok()) {
ROCKS_LOG_WARN(rep->ioptions.info_log,
"Could not read last index partition");
return;
}
handle = biter.value();
uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize;
uint64_t prefetch_len = last_off - prefetch_off;
std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
auto& file = table_->rep_->file;
prefetch_buffer.reset(new FilePrefetchBuffer());
s = prefetch_buffer->Prefetch(file.get(), prefetch_off,
static_cast<size_t>(prefetch_len));
Status s = prefetch_buffer->Prefetch(file.get(), prefetch_off,
static_cast<size_t>(prefetch_len));
// After prefetch, read the partitions one by one
biter.SeekToFirst();
auto ro = ReadOptions();
Cache* block_cache = rep->table_options.block_cache.get();
for (; biter.Valid(); biter.Next()) {
input = biter.value();
s = handle.DecodeFrom(&input);
assert(s.ok());
if (!s.ok()) {
ROCKS_LOG_WARN(rep->ioptions.info_log,
"Could not read index partition");
continue;
}
handle = biter.value();
BlockBasedTable::CachableEntry<Block> block;
Slice compression_dict;
if (rep->compression_dict_block) {
@ -374,11 +354,13 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
PartitionIndexReader(BlockBasedTable* table,
const InternalKeyComparator* icomparator,
std::unique_ptr<Block>&& index_block, Statistics* stats,
const int /*level*/, const bool index_key_includes_seq)
const int /*level*/, const bool index_key_includes_seq,
const bool index_value_is_full)
: IndexReader(icomparator, stats),
table_(table),
index_block_(std::move(index_block)),
index_key_includes_seq_(index_key_includes_seq) {
index_key_includes_seq_(index_key_includes_seq),
index_value_is_full_(index_value_is_full) {
assert(index_block_ != nullptr);
}
BlockBasedTable* table_;
@ -386,6 +368,7 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
std::unordered_map<uint64_t, BlockBasedTable::CachableEntry<Block>>
partition_map_;
const bool index_key_includes_seq_;
const bool index_value_is_full_;
};
// Index that allows binary search lookup for the first key of each block.
@ -404,7 +387,8 @@ class BinarySearchIndexReader : public IndexReader {
const InternalKeyComparator* icomparator,
IndexReader** index_reader,
const PersistentCacheOptions& cache_options,
const bool index_key_includes_seq) {
const bool index_key_includes_seq,
const bool index_value_is_full) {
std::unique_ptr<Block> index_block;
auto s = ReadBlockFromFile(
file, prefetch_buffer, footer, ReadOptions(), index_handle,
@ -415,19 +399,19 @@ class BinarySearchIndexReader : public IndexReader {
if (s.ok()) {
*index_reader = new BinarySearchIndexReader(
icomparator, std::move(index_block), ioptions.statistics,
index_key_includes_seq);
index_key_includes_seq, index_value_is_full);
}
return s;
}
virtual InternalIterator* NewIterator(IndexBlockIter* iter = nullptr,
bool /*dont_care*/ = true,
bool /*dont_care*/ = true) override {
virtual InternalIteratorBase<BlockHandle>* NewIterator(
IndexBlockIter* iter = nullptr, bool /*dont_care*/ = true,
bool /*dont_care*/ = true) override {
Statistics* kNullStats = nullptr;
return index_block_->NewIterator<IndexBlockIter>(
icomparator_, icomparator_->user_comparator(), iter, kNullStats, true,
index_key_includes_seq_);
index_key_includes_seq_, index_value_is_full_);
}
virtual size_t size() const override { return index_block_->size(); }
@ -449,31 +433,32 @@ class BinarySearchIndexReader : public IndexReader {
private:
BinarySearchIndexReader(const InternalKeyComparator* icomparator,
std::unique_ptr<Block>&& index_block,
Statistics* stats, const bool index_key_includes_seq)
Statistics* stats, const bool index_key_includes_seq,
const bool index_value_is_full)
: IndexReader(icomparator, stats),
index_block_(std::move(index_block)),
index_key_includes_seq_(index_key_includes_seq) {
index_key_includes_seq_(index_key_includes_seq),
index_value_is_full_(index_value_is_full) {
assert(index_block_ != nullptr);
}
std::unique_ptr<Block> index_block_;
const bool index_key_includes_seq_;
const bool index_value_is_full_;
};
// Index that leverages an internal hash table to quicken the lookup for a given
// key.
class HashIndexReader : public IndexReader {
public:
static Status Create(const SliceTransform* hash_key_extractor,
const Footer& footer, RandomAccessFileReader* file,
FilePrefetchBuffer* prefetch_buffer,
const ImmutableCFOptions& ioptions,
const InternalKeyComparator* icomparator,
const BlockHandle& index_handle,
InternalIterator* meta_index_iter,
IndexReader** index_reader,
bool /*hash_index_allow_collision*/,
const PersistentCacheOptions& cache_options,
const bool index_key_includes_seq) {
static Status Create(
const SliceTransform* hash_key_extractor, const Footer& footer,
RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
const ImmutableCFOptions& ioptions,
const InternalKeyComparator* icomparator, const BlockHandle& index_handle,
InternalIterator* meta_index_iter, IndexReader** index_reader,
bool /*hash_index_allow_collision*/,
const PersistentCacheOptions& cache_options,
const bool index_key_includes_seq, const bool index_value_is_full) {
std::unique_ptr<Block> index_block;
auto s = ReadBlockFromFile(
file, prefetch_buffer, footer, ReadOptions(), index_handle,
@ -489,9 +474,9 @@ class HashIndexReader : public IndexReader {
// hard error. We can still fall back to the original binary search index.
// So, Create will succeed regardless, from this point on.
auto new_index_reader =
new HashIndexReader(icomparator, std::move(index_block),
ioptions.statistics, index_key_includes_seq);
auto new_index_reader = new HashIndexReader(
icomparator, std::move(index_block), ioptions.statistics,
index_key_includes_seq, index_value_is_full);
*index_reader = new_index_reader;
// Get prefixes block
@ -545,13 +530,14 @@ class HashIndexReader : public IndexReader {
return Status::OK();
}
virtual InternalIterator* NewIterator(IndexBlockIter* iter = nullptr,
bool total_order_seek = true,
bool /*dont_care*/ = true) override {
virtual InternalIteratorBase<BlockHandle>* NewIterator(
IndexBlockIter* iter = nullptr, bool total_order_seek = true,
bool /*dont_care*/ = true) override {
Statistics* kNullStats = nullptr;
return index_block_->NewIterator<IndexBlockIter>(
icomparator_, icomparator_->user_comparator(), iter, kNullStats,
total_order_seek, index_key_includes_seq_, prefix_index_.get());
total_order_seek, index_key_includes_seq_, index_value_is_full_,
prefix_index_.get());
}
virtual size_t size() const override { return index_block_->size(); }
@ -577,10 +563,12 @@ class HashIndexReader : public IndexReader {
private:
HashIndexReader(const InternalKeyComparator* icomparator,
std::unique_ptr<Block>&& index_block, Statistics* stats,
const bool index_key_includes_seq)
const bool index_key_includes_seq,
const bool index_value_is_full)
: IndexReader(icomparator, stats),
index_block_(std::move(index_block)),
index_key_includes_seq_(index_key_includes_seq) {
index_key_includes_seq_(index_key_includes_seq),
index_value_is_full_(index_value_is_full) {
assert(index_block_ != nullptr);
}
@ -591,6 +579,7 @@ class HashIndexReader : public IndexReader {
std::unique_ptr<BlockPrefixIndex> prefix_index_;
BlockContents prefixes_contents_;
const bool index_key_includes_seq_;
const bool index_value_is_full_;
};
// Helper function to setup the cache key's prefix for the Table.
@ -1030,8 +1019,9 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
bool disable_prefix_seek =
rep->index_type == BlockBasedTableOptions::kHashSearch &&
need_upper_bound_check;
unique_ptr<InternalIterator> iter(new_table->NewIndexIterator(
ReadOptions(), disable_prefix_seek, nullptr, &index_entry));
unique_ptr<InternalIteratorBase<BlockHandle>> iter(
new_table->NewIndexIterator(ReadOptions(), disable_prefix_seek,
nullptr, &index_entry));
s = iter->status();
if (s.ok()) {
// This is the first call to NewIndexIterator() since we're in Open().
@ -1435,7 +1425,9 @@ FilterBlockReader* BlockBasedTable::ReadFilter(
rep->whole_key_filtering, std::move(block), nullptr,
rep->ioptions.statistics, rep->internal_comparator, this,
rep_->table_properties == nullptr ||
!rep_->table_properties->index_key_is_user_key);
rep_->table_properties->index_key_is_user_key == 0,
rep_->table_properties == nullptr ||
rep_->table_properties->index_value_is_delta_encoded == 0);
}
case Rep::FilterType::kBlockFilter:
@ -1553,7 +1545,7 @@ BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
// disable_prefix_seek should be set to true when prefix_extractor found in SST
// differs from the one in mutable_cf_options and index type is HashBasedIndex
InternalIterator* BlockBasedTable::NewIndexIterator(
InternalIteratorBase<BlockHandle>* BlockBasedTable::NewIndexIterator(
const ReadOptions& read_options, bool disable_prefix_seek,
IndexBlockIter* input_iter, CachableEntry<IndexReader>* index_entry,
GetContext* get_context) {
@ -1592,7 +1584,8 @@ InternalIterator* BlockBasedTable::NewIndexIterator(
input_iter->Invalidate(Status::Incomplete("no blocking io"));
return input_iter;
} else {
return NewErrorInternalIterator(Status::Incomplete("no blocking io"));
return NewErrorInternalIterator<BlockHandle>(
Status::Incomplete("no blocking io"));
}
}
@ -1639,7 +1632,7 @@ InternalIterator* BlockBasedTable::NewIndexIterator(
input_iter->Invalidate(s);
return input_iter;
} else {
return NewErrorInternalIterator(s);
return NewErrorInternalIterator<BlockHandle>(s);
}
}
@ -1660,21 +1653,6 @@ InternalIterator* BlockBasedTable::NewIndexIterator(
return iter;
}
template <typename TBlockIter>
TBlockIter* BlockBasedTable::NewDataBlockIterator(
Rep* rep, const ReadOptions& ro, const Slice& index_value,
TBlockIter* input_iter, bool is_index, bool key_includes_seq,
GetContext* get_context, FilePrefetchBuffer* prefetch_buffer) {
BlockHandle handle;
Slice input = index_value;
// We intentionally allow extra stuff in index_value so that we
// can add more features in the future.
Status s = handle.DecodeFrom(&input);
return NewDataBlockIterator<TBlockIter>(rep, ro, handle, input_iter, is_index,
key_includes_seq, get_context, s,
prefetch_buffer);
}
// Convert an index iterator value (i.e., an encoded BlockHandle)
// into an iterator over the contents of the corresponding block.
// If input_iter is null, new a iterator
@ -1683,7 +1661,8 @@ template <typename TBlockIter>
TBlockIter* BlockBasedTable::NewDataBlockIterator(
Rep* rep, const ReadOptions& ro, const BlockHandle& handle,
TBlockIter* input_iter, bool is_index, bool key_includes_seq,
GetContext* get_context, Status s, FilePrefetchBuffer* prefetch_buffer) {
bool index_key_is_full, GetContext* get_context, Status s,
FilePrefetchBuffer* prefetch_buffer) {
PERF_TIMER_GUARD(new_table_block_iter_nanos);
const bool no_io = (ro.read_tier == kBlockCacheTier);
@ -1733,7 +1712,8 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator(
const bool kTotalOrderSeek = true;
iter = block.value->NewIterator<TBlockIter>(
&rep->internal_comparator, rep->internal_comparator.user_comparator(),
iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq);
iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq,
index_key_is_full);
if (block.cache_handle != nullptr) {
iter->RegisterCleanup(&ReleaseCachedEntry, block_cache,
block.cache_handle);
@ -1848,22 +1828,20 @@ Status BlockBasedTable::MaybeLoadDataBlockToCache(
BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState(
BlockBasedTable* table,
std::unordered_map<uint64_t, CachableEntry<Block>>* block_map,
bool index_key_includes_seq)
bool index_key_includes_seq, bool index_key_is_full)
: table_(table),
block_map_(block_map),
index_key_includes_seq_(index_key_includes_seq) {}
index_key_includes_seq_(index_key_includes_seq),
index_key_is_full_(index_key_is_full) {}
template <class TBlockIter>
const size_t BlockBasedTableIterator<TBlockIter>::kMaxReadaheadSize =
template <class TBlockIter, typename TValue>
const size_t BlockBasedTableIterator<TBlockIter, TValue>::kMaxReadaheadSize =
256 * 1024;
InternalIterator*
InternalIteratorBase<BlockHandle>*
BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
const Slice& index_value) {
const BlockHandle& handle) {
// Return a block iterator on the index partition
BlockHandle handle;
Slice input = index_value;
Status s = handle.DecodeFrom(&input);
auto rep = table_->get_rep();
auto block = block_map_->find(handle.offset());
// This is a possible scenario since block cache might not have had space
@ -1879,10 +1857,10 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
Statistics* kNullStats = nullptr;
return block->second.value->NewIterator<IndexBlockIter>(
&rep->internal_comparator, rep->internal_comparator.user_comparator(),
nullptr, kNullStats, true, index_key_includes_seq_);
nullptr, kNullStats, true, index_key_includes_seq_, index_key_is_full_);
}
// Create an empty iterator
return new DataBlockIter();
return new IndexBlockIter();
}
// This will be broken if the user specifies an unusual implementation
@ -1955,7 +1933,7 @@ bool BlockBasedTable::PrefixMayMatch(
// Then, try find it within each block
// we already know prefix_extractor and prefix_extractor_name must match
// because `CheckPrefixMayMatch` first checks `check_filter_ == true`
unique_ptr<InternalIterator> iiter(
unique_ptr<InternalIteratorBase<BlockHandle>> iiter(
NewIndexIterator(no_io_read_options,
/* need_upper_bound_check */ false));
iiter->Seek(internal_prefix);
@ -1988,10 +1966,7 @@ bool BlockBasedTable::PrefixMayMatch(
// after the data block corresponding to iiter->key() cannot
// possibly contain the key. Thus, the corresponding data block
// is the only on could potentially contain the prefix.
Slice handle_value = iiter->value();
BlockHandle handle;
s = handle.DecodeFrom(&handle_value);
assert(s.ok());
BlockHandle handle = iiter->value();
may_match =
filter->PrefixMayMatch(prefix, prefix_extractor, handle.offset());
}
@ -2015,8 +1990,8 @@ bool BlockBasedTable::PrefixMayMatch(
return may_match;
}
template <class TBlockIter>
void BlockBasedTableIterator<TBlockIter>::Seek(const Slice& target) {
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::Seek(const Slice& target) {
is_out_of_bound_ = false;
if (!CheckPrefixMayMatch(target)) {
ResetDataIter();
@ -2045,8 +2020,9 @@ void BlockBasedTableIterator<TBlockIter>::Seek(const Slice& target) {
block_iter_.key()) <= 0));
}
template <class TBlockIter>
void BlockBasedTableIterator<TBlockIter>::SeekForPrev(const Slice& target) {
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev(
const Slice& target) {
is_out_of_bound_ = false;
if (!CheckPrefixMayMatch(target)) {
ResetDataIter();
@ -2088,8 +2064,8 @@ void BlockBasedTableIterator<TBlockIter>::SeekForPrev(const Slice& target) {
icomp_.Compare(target, block_iter_.key()) >= 0);
}
template <class TBlockIter>
void BlockBasedTableIterator<TBlockIter>::SeekToFirst() {
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::SeekToFirst() {
is_out_of_bound_ = false;
SavePrevIndexValue();
index_iter_->SeekToFirst();
@ -2102,8 +2078,8 @@ void BlockBasedTableIterator<TBlockIter>::SeekToFirst() {
FindKeyForward();
}
template <class TBlockIter>
void BlockBasedTableIterator<TBlockIter>::SeekToLast() {
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::SeekToLast() {
is_out_of_bound_ = false;
SavePrevIndexValue();
index_iter_->SeekToLast();
@ -2116,32 +2092,30 @@ void BlockBasedTableIterator<TBlockIter>::SeekToLast() {
FindKeyBackward();
}
template <class TBlockIter>
void BlockBasedTableIterator<TBlockIter>::Next() {
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::Next() {
assert(block_iter_points_to_real_block_);
block_iter_.Next();
FindKeyForward();
}
template <class TBlockIter>
void BlockBasedTableIterator<TBlockIter>::Prev() {
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::Prev() {
assert(block_iter_points_to_real_block_);
block_iter_.Prev();
FindKeyBackward();
}
template <class TBlockIter>
void BlockBasedTableIterator<TBlockIter>::InitDataBlock() {
BlockHandle data_block_handle;
Slice handle_slice = index_iter_->value();
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
BlockHandle data_block_handle = index_iter_->value();
if (!block_iter_points_to_real_block_ ||
handle_slice.compare(prev_index_value_) != 0 ||
data_block_handle.offset() != prev_index_value_.offset() ||
// if previous attempt of reading the block missed cache, try again
block_iter_.status().IsIncomplete()) {
if (block_iter_points_to_real_block_) {
ResetDataIter();
}
Status s = data_block_handle.DecodeFrom(&handle_slice);
auto* rep = table_->get_rep();
// Automatically prefetch additional data when a range scan (iterator) does
@ -2173,16 +2147,17 @@ void BlockBasedTableIterator<TBlockIter>::InitDataBlock() {
}
}
Status s;
BlockBasedTable::NewDataBlockIterator<TBlockIter>(
rep, read_options_, data_block_handle, &block_iter_, is_index_,
key_includes_seq_,
key_includes_seq_, index_key_is_full_,
/* get_context */ nullptr, s, prefetch_buffer_.get());
block_iter_points_to_real_block_ = true;
}
}
template <class TBlockIter>
void BlockBasedTableIterator<TBlockIter>::FindKeyForward() {
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyForward() {
assert(!is_out_of_bound_);
// TODO the while loop inherits from two-level-iterator. We don't know
// whether a block can be empty so it can be replaced by an "if".
@ -2221,8 +2196,8 @@ void BlockBasedTableIterator<TBlockIter>::FindKeyForward() {
}
}
template <class TBlockIter>
void BlockBasedTableIterator<TBlockIter>::FindKeyBackward() {
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyBackward() {
assert(!is_out_of_bound_);
while (!block_iter_.Valid()) {
if (!block_iter_.status().ok()) {
@ -2297,11 +2272,10 @@ InternalIterator* BlockBasedTable::NewRangeTombstoneIterator(
return iter;
}
}
std::string str;
rep_->range_del_handle.EncodeTo(&str);
// The meta-block exists but isn't in uncompressed block cache (maybe
// because it is disabled), so go through the full lookup process.
return NewDataBlockIterator<DataBlockIter>(rep_, read_options, Slice(str));
return NewDataBlockIterator<DataBlockIter>(rep_, read_options,
rep_->range_del_handle);
}
bool BlockBasedTable::FullFilterKeyMayMatch(
@ -2364,7 +2338,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
auto iiter =
NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
/* index_entry */ nullptr, get_context);
std::unique_ptr<InternalIterator> iiter_unique_ptr;
std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
if (iiter != &iiter_on_stack) {
iiter_unique_ptr.reset(iiter);
}
@ -2372,12 +2346,10 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
bool matched = false; // if such user key mathced a key in SST
bool done = false;
for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
Slice handle_value = iiter->value();
BlockHandle handle = iiter->value();
BlockHandle handle;
bool not_exist_in_filter =
filter != nullptr && filter->IsBlockBased() == true &&
handle.DecodeFrom(&handle_value).ok() &&
!filter->KeyMayMatch(ExtractUserKey(key), prefix_extractor,
handle.offset(), no_io);
@ -2455,9 +2427,10 @@ Status BlockBasedTable::Prefetch(const Slice* const begin,
IndexBlockIter iiter_on_stack;
auto iiter = NewIndexIterator(ReadOptions(), false, &iiter_on_stack);
std::unique_ptr<InternalIterator> iiter_unique_ptr;
std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
if (iiter != &iiter_on_stack) {
iiter_unique_ptr = std::unique_ptr<InternalIterator>(iiter);
iiter_unique_ptr =
std::unique_ptr<InternalIteratorBase<BlockHandle>>(iiter);
}
if (!iiter->status().ok()) {
@ -2470,7 +2443,7 @@ Status BlockBasedTable::Prefetch(const Slice* const begin,
for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid();
iiter->Next()) {
Slice block_handle = iiter->value();
BlockHandle block_handle = iiter->value();
const bool is_user_key = rep_->table_properties &&
rep_->table_properties->index_key_is_user_key > 0;
if (end &&
@ -2516,11 +2489,12 @@ Status BlockBasedTable::VerifyChecksum() {
}
// Check Data blocks
IndexBlockIter iiter_on_stack;
InternalIterator* iiter =
InternalIteratorBase<BlockHandle>* iiter =
NewIndexIterator(ReadOptions(), false, &iiter_on_stack);
std::unique_ptr<InternalIterator> iiter_unique_ptr;
std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
if (iiter != &iiter_on_stack) {
iiter_unique_ptr = std::unique_ptr<InternalIterator>(iiter);
iiter_unique_ptr =
std::unique_ptr<InternalIteratorBase<BlockHandle>>(iiter);
}
if (!iiter->status().ok()) {
// error opening index iterator
@ -2530,19 +2504,41 @@ Status BlockBasedTable::VerifyChecksum() {
return s;
}
Status BlockBasedTable::VerifyChecksumInBlocks(InternalIterator* index_iter) {
Status BlockBasedTable::VerifyChecksumInBlocks(
InternalIteratorBase<BlockHandle>* index_iter) {
Status s;
for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
s = index_iter->status();
if (!s.ok()) {
break;
}
BlockHandle handle;
Slice input = index_iter->value();
s = handle.DecodeFrom(&input);
BlockHandle handle = index_iter->value();
BlockContents contents;
Slice dummy_comp_dict;
BlockFetcher block_fetcher(rep_->file.get(), nullptr /* prefetch buffer */,
rep_->footer, ReadOptions(), handle, &contents,
rep_->ioptions, false /* decompress */,
dummy_comp_dict /*compression dict*/,
rep_->persistent_cache_options);
s = block_fetcher.ReadBlockContents();
if (!s.ok()) {
break;
}
}
return s;
}
Status BlockBasedTable::VerifyChecksumInBlocks(
InternalIteratorBase<Slice>* index_iter) {
Status s;
for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
s = index_iter->status();
if (!s.ok()) {
break;
}
BlockHandle handle;
Slice input = index_iter->value();
s = handle.DecodeFrom(&input);
BlockContents contents;
Slice dummy_comp_dict;
BlockFetcher block_fetcher(rep_->file.get(), nullptr /* prefetch buffer */,
@ -2560,15 +2556,13 @@ Status BlockBasedTable::VerifyChecksumInBlocks(InternalIterator* index_iter) {
bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
const Slice& key) {
std::unique_ptr<InternalIterator> iiter(NewIndexIterator(options));
std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter(
NewIndexIterator(options));
iiter->Seek(key);
assert(iiter->Valid());
CachableEntry<Block> block;
BlockHandle handle;
Slice input = iiter->value();
Status s = handle.DecodeFrom(&input);
assert(s.ok());
BlockHandle handle = iiter->value();
Cache* block_cache = rep_->table_options.block_cache.get();
assert(block_cache != nullptr);
@ -2578,6 +2572,7 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
cache_key_storage);
Slice ckey;
Status s;
s = GetDataBlockFromCache(
cache_key, ckey, block_cache, nullptr, rep_->ioptions, options, &block,
rep_->table_options.format_version,
@ -2638,14 +2633,18 @@ Status BlockBasedTable::CreateIndexReader(
rep_->ioptions, icomparator, index_reader,
rep_->persistent_cache_options, level,
rep_->table_properties == nullptr ||
rep_->table_properties->index_key_is_user_key == 0);
rep_->table_properties->index_key_is_user_key == 0,
rep_->table_properties == nullptr ||
rep_->table_properties->index_value_is_delta_encoded == 0);
}
case BlockBasedTableOptions::kBinarySearch: {
return BinarySearchIndexReader::Create(
file, prefetch_buffer, footer, footer.index_handle(), rep_->ioptions,
icomparator, index_reader, rep_->persistent_cache_options,
rep_->table_properties == nullptr ||
rep_->table_properties->index_key_is_user_key == 0);
rep_->table_properties->index_key_is_user_key == 0,
rep_->table_properties == nullptr ||
rep_->table_properties->index_value_is_delta_encoded == 0);
}
case BlockBasedTableOptions::kHashSearch: {
std::unique_ptr<Block> meta_guard;
@ -2665,7 +2664,9 @@ Status BlockBasedTable::CreateIndexReader(
rep_->ioptions, icomparator, index_reader,
rep_->persistent_cache_options,
rep_->table_properties == nullptr ||
rep_->table_properties->index_key_is_user_key == 0);
rep_->table_properties->index_key_is_user_key == 0,
rep_->table_properties == nullptr ||
rep_->table_properties->index_value_is_delta_encoded == 0);
}
meta_index_iter = meta_iter_guard.get();
}
@ -2676,7 +2677,9 @@ Status BlockBasedTable::CreateIndexReader(
index_reader, rep_->hash_index_allow_collision,
rep_->persistent_cache_options,
rep_->table_properties == nullptr ||
rep_->table_properties->index_key_is_user_key == 0);
rep_->table_properties->index_key_is_user_key == 0,
rep_->table_properties == nullptr ||
rep_->table_properties->index_value_is_delta_encoded == 0);
}
default: {
std::string error_message =
@ -2687,22 +2690,14 @@ Status BlockBasedTable::CreateIndexReader(
}
uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) {
unique_ptr<InternalIterator> index_iter(NewIndexIterator(ReadOptions()));
unique_ptr<InternalIteratorBase<BlockHandle>> index_iter(
NewIndexIterator(ReadOptions()));
index_iter->Seek(key);
uint64_t result;
if (index_iter->Valid()) {
BlockHandle handle;
Slice input = index_iter->value();
Status s = handle.DecodeFrom(&input);
if (s.ok()) {
result = handle.offset();
} else {
// Strange: we can't decode the block handle in the index block.
// We'll just return the offset of the metaindex block, which is
// close to the whole file size for this case.
result = rep_->footer.metaindex_handle().offset();
}
BlockHandle handle = index_iter->value();
result = handle.offset();
} else {
// key is past the last key in the file. If table_properties is not
// available, approximate the offset by returning the offset of the
@ -2729,7 +2724,7 @@ bool BlockBasedTable::TEST_index_reader_preloaded() const {
Status BlockBasedTable::GetKVPairsFromDataBlocks(
std::vector<KVPairBlock>* kv_pair_blocks) {
std::unique_ptr<InternalIterator> blockhandles_iter(
std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter(
NewIndexIterator(ReadOptions()));
Status s = blockhandles_iter->status();
@ -2944,7 +2939,7 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
out_file->Append(
"Index Details:\n"
"--------------------------------------\n");
std::unique_ptr<InternalIterator> blockhandles_iter(
std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter(
NewIndexIterator(ReadOptions()));
Status s = blockhandles_iter->status();
if (!s.ok()) {
@ -2993,7 +2988,7 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
}
Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
std::unique_ptr<InternalIterator> blockhandles_iter(
std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter(
NewIndexIterator(ReadOptions()));
Status s = blockhandles_iter->status();
if (!s.ok()) {
@ -3013,9 +3008,7 @@ Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
break;
}
Slice bh_val = blockhandles_iter->value();
BlockHandle bh;
bh.DecodeFrom(&bh_val);
BlockHandle bh = blockhandles_iter->value();
uint64_t datablock_size = bh.size();
datablock_size_min = std::min(datablock_size_min, datablock_size);
datablock_size_max = std::max(datablock_size_max, datablock_size);

@ -51,7 +51,6 @@ struct BlockBasedTableOptions;
struct EnvOptions;
struct ReadOptions;
class GetContext;
class InternalIterator;
using std::unique_ptr;
@ -178,9 +177,9 @@ class BlockBasedTable : public TableReader {
// to
// a different object then iter and the callee has the ownership of the
// returned object.
virtual InternalIterator* NewIterator(IndexBlockIter* iter = nullptr,
bool total_order_seek = true,
bool fill_cache = true) = 0;
virtual InternalIteratorBase<BlockHandle>* NewIterator(
IndexBlockIter* iter = nullptr, bool total_order_seek = true,
bool fill_cache = true) = 0;
// The size of the index.
virtual size_t size() const = 0;
@ -224,14 +223,16 @@ class BlockBasedTable : public TableReader {
static TBlockIter* NewDataBlockIterator(
Rep* rep, const ReadOptions& ro, const Slice& index_value,
TBlockIter* input_iter = nullptr, bool is_index = false,
bool key_includes_seq = true, GetContext* get_context = nullptr,
bool key_includes_seq = true, bool index_key_is_full = true,
GetContext* get_context = nullptr,
FilePrefetchBuffer* prefetch_buffer = nullptr);
template <typename TBlockIter>
static TBlockIter* NewDataBlockIterator(
Rep* rep, const ReadOptions& ro, const BlockHandle& block_hanlde,
TBlockIter* input_iter = nullptr, bool is_index = false,
bool key_includes_seq = true, GetContext* get_context = nullptr,
Status s = Status(), FilePrefetchBuffer* prefetch_buffer = nullptr);
bool key_includes_seq = true, bool index_key_is_full = true,
GetContext* get_context = nullptr, Status s = Status(),
FilePrefetchBuffer* prefetch_buffer = nullptr);
class PartitionedIndexIteratorState;
@ -284,7 +285,7 @@ class BlockBasedTable : public TableReader {
// 2. index is not present in block cache.
// 3. We disallowed any io to be performed, that is, read_options ==
// kBlockCacheTier
InternalIterator* NewIndexIterator(
InternalIteratorBase<BlockHandle>* NewIndexIterator(
const ReadOptions& read_options, bool need_upper_bound_check = false,
IndexBlockIter* input_iter = nullptr,
CachableEntry<IndexReader>* index_entry = nullptr,
@ -353,7 +354,8 @@ class BlockBasedTable : public TableReader {
std::unique_ptr<Block>* meta_block,
std::unique_ptr<InternalIterator>* iter);
Status VerifyChecksumInBlocks(InternalIterator* index_iter);
Status VerifyChecksumInBlocks(InternalIteratorBase<Slice>* index_iter);
Status VerifyChecksumInBlocks(InternalIteratorBase<BlockHandle>* index_iter);
// Create the filter from the filter block.
virtual FilterBlockReader* ReadFilter(
@ -390,14 +392,16 @@ class BlockBasedTable::PartitionedIndexIteratorState
PartitionedIndexIteratorState(
BlockBasedTable* table,
std::unordered_map<uint64_t, CachableEntry<Block>>* block_map,
const bool index_key_includes_seq);
InternalIterator* NewSecondaryIterator(const Slice& index_value) override;
const bool index_key_includes_seq, const bool index_key_is_full);
InternalIteratorBase<BlockHandle>* NewSecondaryIterator(
const BlockHandle& index_value) override;
private:
// Don't own table_
BlockBasedTable* table_;
std::unordered_map<uint64_t, CachableEntry<Block>>* block_map_;
bool index_key_includes_seq_;
bool index_key_is_full_;
};
// CachableEntry represents the entries that *may* be fetched from block cache.
@ -521,16 +525,17 @@ struct BlockBasedTable::Rep {
const bool immortal_table;
};
template <class TBlockIter>
class BlockBasedTableIterator : public InternalIterator {
template <class TBlockIter, typename TValue = Slice>
class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
public:
BlockBasedTableIterator(BlockBasedTable* table,
const ReadOptions& read_options,
const InternalKeyComparator& icomp,
InternalIterator* index_iter, bool check_filter,
bool need_upper_bound_check,
InternalIteratorBase<BlockHandle>* index_iter,
bool check_filter, bool need_upper_bound_check,
const SliceTransform* prefix_extractor, bool is_index,
bool key_includes_seq = true,
bool index_key_is_full = true,
bool for_compaction = false)
: table_(table),
read_options_(read_options),
@ -543,6 +548,7 @@ class BlockBasedTableIterator : public InternalIterator {
prefix_extractor_(prefix_extractor),
is_index_(is_index),
key_includes_seq_(key_includes_seq),
index_key_is_full_(index_key_is_full),
for_compaction_(for_compaction) {}
~BlockBasedTableIterator() { delete index_iter_; }
@ -561,7 +567,7 @@ class BlockBasedTableIterator : public InternalIterator {
assert(Valid());
return block_iter_.key();
}
Slice value() const override {
TValue value() const override {
assert(Valid());
return block_iter_.value();
}
@ -618,8 +624,7 @@ class BlockBasedTableIterator : public InternalIterator {
if (block_iter_points_to_real_block_) {
// Reseek. If they end up with the same data block, we shouldn't re-fetch
// the same data block.
Slice v = index_iter_->value();
prev_index_value_.assign(v.data(), v.size());
prev_index_value_ = index_iter_->value();
}
}
@ -631,7 +636,7 @@ class BlockBasedTableIterator : public InternalIterator {
BlockBasedTable* table_;
const ReadOptions read_options_;
const InternalKeyComparator& icomp_;
InternalIterator* index_iter_;
InternalIteratorBase<BlockHandle>* index_iter_;
PinnedIteratorsManager* pinned_iters_mgr_;
TBlockIter block_iter_;
bool block_iter_points_to_real_block_;
@ -644,10 +649,10 @@ class BlockBasedTableIterator : public InternalIterator {
bool is_index_;
// If the keys in the blocks over which we iterate include 8 byte sequence
bool key_includes_seq_;
bool index_key_is_full_;
// If this iterator is created for compaction
bool for_compaction_;
// TODO use block offset instead
std::string prev_index_value_;
BlockHandle prev_index_value_;
static const size_t kInitReadaheadSize = 8 * 1024;
// Found that 256 KB readahead size provides the best performance, based on

@ -41,9 +41,11 @@
namespace rocksdb {
BlockBuilder::BlockBuilder(int block_restart_interval, bool use_delta_encoding)
BlockBuilder::BlockBuilder(int block_restart_interval, bool use_delta_encoding,
bool use_value_delta_encoding)
: block_restart_interval_(block_restart_interval),
use_delta_encoding_(use_delta_encoding),
use_value_delta_encoding_(use_value_delta_encoding),
restarts_(),
counter_(0),
finished_(false) {
@ -65,14 +67,27 @@ void BlockBuilder::Reset() {
size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, const Slice& value)
const {
size_t estimate = CurrentSizeEstimate();
estimate += key.size() + value.size();
// Note: this is an imprecise estimate as it accounts for the whole key size
// instead of non-shared key size.
estimate += key.size();
// In value delta encoding we estimate the value delta size as half the full
// value size since only the size field of block handle is encoded.
estimate +=
!use_value_delta_encoding_ || (counter_ >= block_restart_interval_)
? value.size()
: value.size() / 2;
if (counter_ >= block_restart_interval_) {
estimate += sizeof(uint32_t); // a new restart entry.
}
estimate += sizeof(int32_t); // varint for shared prefix length.
// Note: this is an imprecise estimate as we will have to encoded size, one
// for shared key and one for non-shared key.
estimate += VarintLength(key.size()); // varint for key length.
estimate += VarintLength(value.size()); // varint for value length.
if (!use_value_delta_encoding_ || (counter_ >= block_restart_interval_)) {
estimate += VarintLength(value.size()); // varint for value length.
}
return estimate;
}
@ -87,9 +102,11 @@ Slice BlockBuilder::Finish() {
return Slice(buffer_);
}
void BlockBuilder::Add(const Slice& key, const Slice& value) {
void BlockBuilder::Add(const Slice& key, const Slice& value,
const Slice* const delta_value) {
assert(!finished_);
assert(counter_ <= block_restart_interval_);
assert(!use_value_delta_encoding_ || delta_value);
size_t shared = 0; // number of bytes shared with prev key
if (counter_ >= block_restart_interval_) {
// Restart compression
@ -115,14 +132,27 @@ void BlockBuilder::Add(const Slice& key, const Slice& value) {
const size_t non_shared = key.size() - shared;
const size_t curr_size = buffer_.size();
// Add "<shared><non_shared><value_size>" to buffer_
PutVarint32Varint32Varint32(&buffer_, static_cast<uint32_t>(shared),
static_cast<uint32_t>(non_shared),
static_cast<uint32_t>(value.size()));
if (use_value_delta_encoding_) {
// Add "<shared><non_shared>" to buffer_
PutVarint32Varint32(&buffer_, static_cast<uint32_t>(shared),
static_cast<uint32_t>(non_shared));
} else {
// Add "<shared><non_shared><value_size>" to buffer_
PutVarint32Varint32Varint32(&buffer_, static_cast<uint32_t>(shared),
static_cast<uint32_t>(non_shared),
static_cast<uint32_t>(value.size()));
}
// Add string delta to buffer_ followed by value
buffer_.append(key.data() + shared, non_shared);
buffer_.append(value.data(), value.size());
// Use value delta encoding only when the key has shared bytes. This would
// simplify the decoding, where it can figure which decoding to use simply by
// looking at the shared bytes size.
if (shared != 0 && use_value_delta_encoding_) {
buffer_.append(delta_value->data(), delta_value->size());
} else {
buffer_.append(value.data(), value.size());
}
counter_++;
estimate_ += buffer_.size() - curr_size;

@ -21,14 +21,16 @@ class BlockBuilder {
void operator=(const BlockBuilder&) = delete;
explicit BlockBuilder(int block_restart_interval,
bool use_delta_encoding = true);
bool use_delta_encoding = true,
bool use_value_delta_encoding = false);
// Reset the contents as if the BlockBuilder was just constructed.
void Reset();
// REQUIRES: Finish() has not been called since the last call to Reset().
// REQUIRES: key is larger than any previously added key
void Add(const Slice& key, const Slice& value);
void Add(const Slice& key, const Slice& value,
const Slice* const delta_value = nullptr);
// Finish building the block and return a slice that refers to the
// block contents. The returned slice will remain valid for the
@ -49,7 +51,10 @@ class BlockBuilder {
private:
const int block_restart_interval_;
//TODO(myabandeh): put it into a separate IndexBlockBuilder
const bool use_delta_encoding_;
// Refer to BlockIter::DecodeCurrentValue for format of delta encoded values
const bool use_value_delta_encoding_;
std::string buffer_; // Destination buffer
std::vector<uint32_t> restarts_; // Restart points

@ -68,6 +68,29 @@ void GenerateRandomKVs(std::vector<std::string> *keys,
}
}
// Same as GenerateRandomKVs but the values are BlockHandle
void GenerateRandomKBHs(std::vector<std::string> *keys,
std::vector<BlockHandle> *values, const int from,
const int len, const int step = 1,
const int padding_size = 0,
const int keys_share_prefix = 1) {
Random rnd(302);
uint64_t offset = 0;
// generate different prefix
for (int i = from; i < from + len; i += step) {
// generate keys that shares the prefix
for (int j = 0; j < keys_share_prefix; ++j) {
keys->emplace_back(GenerateKey(i, j, padding_size, &rnd));
uint64_t size = rnd.Uniform(1024 * 16);
BlockHandle handle(offset, size);
offset += size + kBlockTrailerSize;
values->emplace_back(handle);
}
}
}
class BlockTest : public testing::Test {};
// block test
@ -131,6 +154,84 @@ TEST_F(BlockTest, SimpleTest) {
delete iter;
}
TEST_F(BlockTest, ValueDeltaEncodingTest) {
Random rnd(301);
Options options = Options();
std::unique_ptr<InternalKeyComparator> ic;
ic.reset(new test::PlainInternalKeyComparator(options.comparator));
std::vector<std::string> keys;
std::vector<BlockHandle> values;
const bool kUseDeltaEncoding = true;
const bool kUseValueDeltaEncoding = true;
BlockBuilder builder(16, kUseDeltaEncoding, kUseValueDeltaEncoding);
int num_records = 100;
GenerateRandomKBHs(&keys, &values, 0, num_records);
// add a bunch of records to a block
BlockHandle last_encoded_handle;
for (int i = 0; i < num_records; i++) {
auto block_handle = values[i];
std::string handle_encoding;
block_handle.EncodeTo(&handle_encoding);
std::string handle_delta_encoding;
PutVarsignedint64(&handle_delta_encoding,
block_handle.size() - last_encoded_handle.size());
last_encoded_handle = block_handle;
const Slice handle_delta_encoding_slice(handle_delta_encoding);
builder.Add(keys[i], handle_encoding, &handle_delta_encoding_slice);
}
// read serialized contents of the block
Slice rawblock = builder.Finish();
// create block reader
BlockContents contents;
contents.data = rawblock;
contents.cachable = false;
Block reader(std::move(contents), kDisableGlobalSequenceNumber);
const bool kTotalOrderSeek = true;
const bool kIncludesSeq = true;
const bool kValueIsFull = !kUseValueDeltaEncoding;
IndexBlockIter *kNullIter = nullptr;
Statistics *kNullStats = nullptr;
// read contents of block sequentially
int count = 0;
InternalIteratorBase<BlockHandle> *iter = reader.NewIterator<IndexBlockIter>(
options.comparator, options.comparator, kNullIter, kNullStats,
kTotalOrderSeek, kIncludesSeq, kValueIsFull);
for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) {
// read kv from block
Slice k = iter->key();
BlockHandle handle = iter->value();
// compare with lookaside array
ASSERT_EQ(k.ToString().compare(keys[count]), 0);
ASSERT_EQ(values[count].offset(), handle.offset());
ASSERT_EQ(values[count].size(), handle.size());
}
delete iter;
// read block contents randomly
iter = reader.NewIterator<IndexBlockIter>(
options.comparator, options.comparator, kNullIter, kNullStats,
kTotalOrderSeek, kIncludesSeq, kValueIsFull);
for (int i = 0; i < num_records; i++) {
// find a random key in the lookaside array
int index = rnd.Uniform(num_records);
Slice k(keys[index]);
// search in block for this key
iter->Seek(k);
ASSERT_TRUE(iter->Valid());
BlockHandle handle = iter->value();
ASSERT_EQ(values[index].offset(), handle.offset());
ASSERT_EQ(values[index].size(), handle.size());
}
delete iter;
}
// return the block contents
BlockContents GetBlockContents(std::unique_ptr<BlockBuilder> *builder,
const std::vector<std::string> &keys,

@ -374,15 +374,12 @@ Slice CuckooTableIterator::value() const {
return curr_value_;
}
extern InternalIterator* NewErrorInternalIterator(const Status& status,
Arena* arena);
InternalIterator* CuckooTableReader::NewIterator(
const ReadOptions& /*read_options*/,
const SliceTransform* /* prefix_extractor */, Arena* arena,
bool /*skip_filters*/, bool /*for_compaction*/) {
if (!status().ok()) {
return NewErrorInternalIterator(
return NewErrorInternalIterator<Slice>(
Status::Corruption("CuckooTableReader status is not okay."), arena);
}
CuckooTableIterator* iter;

@ -25,7 +25,6 @@ namespace rocksdb {
class Arena;
class TableReader;
class InternalIterator;
class CuckooTableReader: public TableReader {
public:

@ -54,6 +54,13 @@ void BlockHandle::EncodeTo(std::string* dst) const {
PutVarint64Varint64(dst, offset_, size_);
}
void BlockHandle::EncodeSizeTo(std::string* dst) const {
// Sanity check that all fields have been set
assert(offset_ != ~static_cast<uint64_t>(0));
assert(size_ != ~static_cast<uint64_t>(0));
PutVarint64(dst, size_);
}
Status BlockHandle::DecodeFrom(Slice* input) {
if (GetVarint64(input, &offset_) &&
GetVarint64(input, &size_)) {
@ -66,6 +73,18 @@ Status BlockHandle::DecodeFrom(Slice* input) {
}
}
Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) {
if (GetVarint64(input, &size_)) {
offset_ = _offset;
return Status::OK();
} else {
// reset in case failure after partially decoding
offset_ = 0;
size_ = 0;
return Status::Corruption("bad block handle");
}
}
// Return a string that contains the copy of handle.
std::string BlockHandle::ToString(bool hex) const {
std::string handle_str;

@ -54,6 +54,8 @@ class BlockHandle {
void EncodeTo(std::string* dst) const;
Status DecodeFrom(Slice* input);
Status DecodeSizeFrom(uint64_t offset, Slice* input);
void EncodeSizeTo(std::string* dst) const;
// Return a string that contains the copy of handle.
std::string ToString(bool hex = true) const;
@ -90,7 +92,7 @@ inline uint32_t GetCompressFormatForVersion(
}
inline bool BlockBasedTableSupportedVersion(uint32_t version) {
return version <= 3;
return version <= 4;
}
// Footer encapsulates the fixed information stored at the tail

@ -27,23 +27,26 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder(
BlockBasedTableOptions::IndexType index_type,
const InternalKeyComparator* comparator,
const InternalKeySliceTransform* int_key_slice_transform,
const bool use_value_delta_encoding,
const BlockBasedTableOptions& table_opt) {
IndexBuilder* result = nullptr;
switch (index_type) {
case BlockBasedTableOptions::kBinarySearch: {
result = new ShortenedIndexBuilder(comparator,
table_opt.index_block_restart_interval,
table_opt.format_version);
result = new ShortenedIndexBuilder(
comparator, table_opt.index_block_restart_interval,
table_opt.format_version, use_value_delta_encoding);
}
break;
case BlockBasedTableOptions::kHashSearch: {
result = new HashIndexBuilder(comparator, int_key_slice_transform,
table_opt.index_block_restart_interval,
table_opt.format_version);
table_opt.format_version,
use_value_delta_encoding);
}
break;
case BlockBasedTableOptions::kTwoLevelIndexSearch: {
result = PartitionedIndexBuilder::CreateIndexBuilder(comparator, table_opt);
result = PartitionedIndexBuilder::CreateIndexBuilder(
comparator, use_value_delta_encoding, table_opt);
}
break;
default: {
@ -56,21 +59,27 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder(
PartitionedIndexBuilder* PartitionedIndexBuilder::CreateIndexBuilder(
const InternalKeyComparator* comparator,
const bool use_value_delta_encoding,
const BlockBasedTableOptions& table_opt) {
return new PartitionedIndexBuilder(comparator, table_opt);
return new PartitionedIndexBuilder(comparator, table_opt,
use_value_delta_encoding);
}
PartitionedIndexBuilder::PartitionedIndexBuilder(
const InternalKeyComparator* comparator,
const BlockBasedTableOptions& table_opt)
const BlockBasedTableOptions& table_opt,
const bool use_value_delta_encoding)
: IndexBuilder(comparator),
index_block_builder_(table_opt.index_block_restart_interval,
table_opt.format_version),
true /*use_delta_encoding*/,
use_value_delta_encoding),
index_block_builder_without_seq_(table_opt.index_block_restart_interval,
table_opt.format_version),
true /*use_delta_encoding*/,
use_value_delta_encoding),
sub_index_builder_(nullptr),
table_opt_(table_opt),
seperator_is_key_plus_seq_(false) {}
seperator_is_key_plus_seq_(false),
use_value_delta_encoding_(use_value_delta_encoding) {}
PartitionedIndexBuilder::~PartitionedIndexBuilder() {
delete sub_index_builder_;
@ -80,7 +89,7 @@ void PartitionedIndexBuilder::MakeNewSubIndexBuilder() {
assert(sub_index_builder_ == nullptr);
sub_index_builder_ = new ShortenedIndexBuilder(
comparator_, table_opt_.index_block_restart_interval,
table_opt_.format_version);
table_opt_.format_version, use_value_delta_encoding_);
flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
table_opt_.metadata_block_size, table_opt_.block_size_deviation,
sub_index_builder_->index_block_builder_));
@ -149,10 +158,18 @@ Status PartitionedIndexBuilder::Finish(
Entry& last_entry = entries_.front();
std::string handle_encoding;
last_partition_block_handle.EncodeTo(&handle_encoding);
index_block_builder_.Add(last_entry.key, handle_encoding);
std::string handle_delta_encoding;
PutVarsignedint64(
&handle_delta_encoding,
last_partition_block_handle.size() - last_encoded_handle_.size());
last_encoded_handle_ = last_partition_block_handle;
const Slice handle_delta_encoding_slice(handle_delta_encoding);
index_block_builder_.Add(last_entry.key, handle_encoding,
&handle_delta_encoding_slice);
if (!seperator_is_key_plus_seq_) {
index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key),
handle_encoding);
handle_encoding,
&handle_delta_encoding_slice);
}
entries_.pop_front();
}
@ -202,9 +219,12 @@ size_t PartitionedIndexBuilder::EstimateTopLevelIndexSize(
uint64_t size = it->value->EstimatedSize();
BlockHandle tmp_block_handle(offset, size);
tmp_block_handle.EncodeTo(&tmp_handle_encoding);
std::string handle_delta_encoding;
tmp_block_handle.EncodeSizeTo(&handle_delta_encoding);
const Slice handle_delta_encoding_slice(handle_delta_encoding);
tmp_builder.Add(
seperator_is_key_plus_seq_ ? it->key : ExtractUserKey(it->key),
tmp_handle_encoding);
tmp_handle_encoding, &handle_delta_encoding_slice);
offset += size;
}
return tmp_builder.CurrentSizeEstimate();

@ -38,6 +38,7 @@ class IndexBuilder {
BlockBasedTableOptions::IndexType index_type,
const rocksdb::InternalKeyComparator* comparator,
const InternalKeySliceTransform* int_key_slice_transform,
const bool use_value_delta_encoding,
const BlockBasedTableOptions& table_opt);
// Index builder will construct a set of blocks which contain:
@ -117,11 +118,16 @@ class IndexBuilder {
class ShortenedIndexBuilder : public IndexBuilder {
public:
explicit ShortenedIndexBuilder(const InternalKeyComparator* comparator,
int index_block_restart_interval,
uint32_t format_version)
const int index_block_restart_interval,
const uint32_t format_version,
const bool use_value_delta_encoding)
: IndexBuilder(comparator),
index_block_builder_(index_block_restart_interval),
index_block_builder_without_seq_(index_block_restart_interval) {
index_block_builder_(index_block_restart_interval,
true /*use_delta_encoding*/,
use_value_delta_encoding),
index_block_builder_without_seq_(index_block_restart_interval,
true /*use_delta_encoding*/,
use_value_delta_encoding) {
// Making the default true will disable the feature for old versions
seperator_is_key_plus_seq_ = (format_version <= 2);
}
@ -145,10 +151,17 @@ class ShortenedIndexBuilder : public IndexBuilder {
std::string handle_encoding;
block_handle.EncodeTo(&handle_encoding);
index_block_builder_.Add(sep, handle_encoding);
std::string handle_delta_encoding;
PutVarsignedint64(&handle_delta_encoding,
block_handle.size() - last_encoded_handle_.size());
assert(handle_delta_encoding.size() != 0);
last_encoded_handle_ = block_handle;
const Slice handle_delta_encoding_slice(handle_delta_encoding);
index_block_builder_.Add(sep, handle_encoding,
&handle_delta_encoding_slice);
if (!seperator_is_key_plus_seq_) {
index_block_builder_without_seq_.Add(ExtractUserKey(sep),
handle_encoding);
index_block_builder_without_seq_.Add(ExtractUserKey(sep), handle_encoding,
&handle_delta_encoding_slice);
}
}
@ -183,6 +196,7 @@ class ShortenedIndexBuilder : public IndexBuilder {
BlockBuilder index_block_builder_;
BlockBuilder index_block_builder_without_seq_;
bool seperator_is_key_plus_seq_;
BlockHandle last_encoded_handle_;
};
// HashIndexBuilder contains a binary-searchable primary index and the
@ -217,10 +231,10 @@ class HashIndexBuilder : public IndexBuilder {
explicit HashIndexBuilder(const InternalKeyComparator* comparator,
const SliceTransform* hash_key_extractor,
int index_block_restart_interval,
int format_version)
int format_version, bool use_value_delta_encoding)
: IndexBuilder(comparator),
primary_index_builder_(comparator, index_block_restart_interval,
format_version),
format_version, use_value_delta_encoding),
hash_key_extractor_(hash_key_extractor) {}
virtual void AddIndexEntry(std::string* last_key_in_current_block,
@ -323,10 +337,12 @@ class PartitionedIndexBuilder : public IndexBuilder {
public:
static PartitionedIndexBuilder* CreateIndexBuilder(
const rocksdb::InternalKeyComparator* comparator,
const bool use_value_delta_encoding,
const BlockBasedTableOptions& table_opt);
explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator,
const BlockBasedTableOptions& table_opt);
const BlockBasedTableOptions& table_opt,
const bool use_value_delta_encoding);
virtual ~PartitionedIndexBuilder();
@ -361,6 +377,8 @@ class PartitionedIndexBuilder : public IndexBuilder {
return seperator_is_key_plus_seq_;
}
bool get_use_value_delta_encoding() { return use_value_delta_encoding_; }
private:
void MakeNewSubIndexBuilder();
@ -380,10 +398,12 @@ class PartitionedIndexBuilder : public IndexBuilder {
bool finishing_indexes = false;
const BlockBasedTableOptions& table_opt_;
bool seperator_is_key_plus_seq_;
bool use_value_delta_encoding_;
// true if an external entity (such as filter partition builder) request
// cutting the next partition
bool partition_cut_requested_ = true;
// true if it should cut the next filter partition block
bool cut_filter_block = false;
BlockHandle last_encoded_handle_;
};
} // namespace rocksdb

@ -10,15 +10,17 @@
#include "rocksdb/comparator.h"
#include "rocksdb/iterator.h"
#include "rocksdb/status.h"
#include "table/format.h"
namespace rocksdb {
class PinnedIteratorsManager;
class InternalIterator : public Cleanable {
template <class TValue>
class InternalIteratorBase : public Cleanable {
public:
InternalIterator() {}
virtual ~InternalIterator() {}
InternalIteratorBase() {}
virtual ~InternalIteratorBase() {}
// An iterator is either positioned at a key/value pair, or
// not valid. This method returns true iff the iterator is valid.
@ -66,7 +68,7 @@ class InternalIterator : public Cleanable {
// the returned slice is valid only until the next modification of
// the iterator.
// REQUIRES: Valid()
virtual Slice value() const = 0;
virtual TValue value() const = 0;
// If an error has occurred, return it. Else return an ok status.
// If non-blocking IO is requested and this operation cannot be
@ -117,14 +119,24 @@ class InternalIterator : public Cleanable {
private:
// No copying allowed
InternalIterator(const InternalIterator&) = delete;
InternalIterator& operator=(const InternalIterator&) = delete;
InternalIteratorBase(const InternalIteratorBase&) = delete;
InternalIteratorBase& operator=(const InternalIteratorBase&) = delete;
};
using InternalIterator = InternalIteratorBase<Slice>;
// Return an empty iterator (yields nothing).
extern InternalIterator* NewEmptyInternalIterator();
template <class TValue = Slice>
extern InternalIteratorBase<TValue>* NewEmptyInternalIterator();
// Return an empty iterator with the specified status.
extern InternalIterator* NewErrorInternalIterator(const Status& status);
template <class TValue = Slice>
extern InternalIteratorBase<TValue>* NewErrorInternalIterator(
const Status& status);
// Return an empty iterator with the specified status, allocated arena.
template <class TValue = Slice>
extern InternalIteratorBase<TValue>* NewErrorInternalIterator(
const Status& status, Arena* arena);
} // namespace rocksdb

@ -131,7 +131,8 @@ class EmptyIterator : public Iterator {
Status status_;
};
class EmptyInternalIterator : public InternalIterator {
template <class TValue = Slice>
class EmptyInternalIterator : public InternalIteratorBase<TValue> {
public:
explicit EmptyInternalIterator(const Status& s) : status_(s) {}
virtual bool Valid() const override { return false; }
@ -145,9 +146,9 @@ class EmptyInternalIterator : public InternalIterator {
assert(false);
return Slice();
}
Slice value() const override {
TValue value() const override {
assert(false);
return Slice();
return TValue();
}
virtual Status status() const override { return status_; }
@ -164,30 +165,48 @@ Iterator* NewErrorIterator(const Status& status) {
return new EmptyIterator(status);
}
InternalIterator* NewEmptyInternalIterator() {
return new EmptyInternalIterator(Status::OK());
template <class TValue>
InternalIteratorBase<TValue>* NewErrorInternalIterator(const Status& status) {
return new EmptyInternalIterator<TValue>(status);
}
InternalIterator* NewEmptyInternalIterator(Arena* arena) {
template InternalIteratorBase<BlockHandle>* NewErrorInternalIterator(
const Status& status);
template InternalIteratorBase<Slice>* NewErrorInternalIterator(
const Status& status);
template <class TValue>
InternalIteratorBase<TValue>* NewErrorInternalIterator(const Status& status,
Arena* arena) {
if (arena == nullptr) {
return NewEmptyInternalIterator();
return NewErrorInternalIterator<TValue>(status);
} else {
auto mem = arena->AllocateAligned(sizeof(EmptyIterator));
return new (mem) EmptyInternalIterator(Status::OK());
return new (mem) EmptyInternalIterator<TValue>(status);
}
}
InternalIterator* NewErrorInternalIterator(const Status& status) {
return new EmptyInternalIterator(status);
template InternalIteratorBase<BlockHandle>* NewErrorInternalIterator(
const Status& status, Arena* arena);
template InternalIteratorBase<Slice>* NewErrorInternalIterator(
const Status& status, Arena* arena);
template <class TValue>
InternalIteratorBase<TValue>* NewEmptyInternalIterator() {
return new EmptyInternalIterator<TValue>(Status::OK());
}
template InternalIteratorBase<BlockHandle>* NewEmptyInternalIterator();
template InternalIteratorBase<Slice>* NewEmptyInternalIterator();
InternalIterator* NewErrorInternalIterator(const Status& status, Arena* arena) {
template <class TValue>
InternalIteratorBase<TValue>* NewEmptyInternalIterator(Arena* arena) {
if (arena == nullptr) {
return NewErrorInternalIterator(status);
return NewEmptyInternalIterator<TValue>();
} else {
auto mem = arena->AllocateAligned(sizeof(EmptyIterator));
return new (mem) EmptyInternalIterator(status);
return new (mem) EmptyInternalIterator<TValue>(Status::OK());
}
}
template InternalIteratorBase<BlockHandle>* NewEmptyInternalIterator(
Arena* arena);
template InternalIteratorBase<Slice>* NewEmptyInternalIterator(Arena* arena);
} // namespace rocksdb

@ -19,19 +19,21 @@ namespace rocksdb {
// the valid() and key() results for an underlying iterator.
// This can help avoid virtual function calls and also gives better
// cache locality.
class IteratorWrapper {
template <class TValue = Slice>
class IteratorWrapperBase {
public:
IteratorWrapper() : iter_(nullptr), valid_(false) {}
explicit IteratorWrapper(InternalIterator* _iter) : iter_(nullptr) {
IteratorWrapperBase() : iter_(nullptr), valid_(false) {}
explicit IteratorWrapperBase(InternalIteratorBase<TValue>* _iter)
: iter_(nullptr) {
Set(_iter);
}
~IteratorWrapper() {}
InternalIterator* iter() const { return iter_; }
~IteratorWrapperBase() {}
InternalIteratorBase<TValue>* iter() const { return iter_; }
// Set the underlying Iterator to _iter and return
// previous underlying Iterator.
InternalIterator* Set(InternalIterator* _iter) {
InternalIterator* old_iter = iter_;
InternalIteratorBase<TValue>* Set(InternalIteratorBase<TValue>* _iter) {
InternalIteratorBase<TValue>* old_iter = iter_;
iter_ = _iter;
if (iter_ == nullptr) {
@ -47,7 +49,7 @@ class IteratorWrapper {
if (!is_arena_mode) {
delete iter_;
} else {
iter_->~InternalIterator();
iter_->~InternalIteratorBase<TValue>();
}
}
}
@ -55,7 +57,10 @@ class IteratorWrapper {
// Iterator interface methods
bool Valid() const { return valid_; }
Slice key() const { assert(Valid()); return key_; }
Slice value() const { assert(Valid()); return iter_->value(); }
TValue value() const {
assert(Valid());
return iter_->value();
}
// Methods below require iter() != nullptr
Status status() const { assert(iter_); return iter_->status(); }
void Next() { assert(iter_); iter_->Next(); Update(); }
@ -91,17 +96,16 @@ class IteratorWrapper {
}
}
InternalIterator* iter_;
InternalIteratorBase<TValue>* iter_;
bool valid_;
Slice key_;
};
using IteratorWrapper = IteratorWrapperBase<Slice>;
class Arena;
// Return an empty iterator (yields nothing) allocated from arena.
extern InternalIterator* NewEmptyInternalIterator(Arena* arena);
// Return an empty iterator with the specified status, allocated arena.
extern InternalIterator* NewErrorInternalIterator(const Status& status,
Arena* arena);
template <class TValue = Slice>
extern InternalIteratorBase<TValue>* NewEmptyInternalIterator(Arena* arena);
} // namespace rocksdb

@ -387,7 +387,7 @@ InternalIterator* NewMergingIterator(const InternalKeyComparator* cmp,
Arena* arena, bool prefix_seek_mode) {
assert(n >= 0);
if (n == 0) {
return NewEmptyInternalIterator(arena);
return NewEmptyInternalIterator<Slice>(arena);
} else if (n == 1) {
return list[0];
} else {

@ -15,9 +15,11 @@
namespace rocksdb {
class Comparator;
class InternalIterator;
class Env;
class Arena;
template <class TValue>
class InternalIteratorBase;
using InternalIterator = InternalIteratorBase<Slice>;
// Return an iterator that provided the union of the data in
// children[0,n-1]. Takes ownership of the child iterators and

@ -76,6 +76,8 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) {
Add(TablePropertiesNames::kTopLevelIndexSize, props.top_level_index_size);
}
Add(TablePropertiesNames::kIndexKeyIsUserKey, props.index_key_is_user_key);
Add(TablePropertiesNames::kIndexValueIsDeltaEncoded,
props.index_value_is_delta_encoded);
Add(TablePropertiesNames::kNumEntries, props.num_entries);
Add(TablePropertiesNames::kNumRangeDeletions, props.num_range_deletions);
Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks);
@ -218,6 +220,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file,
&new_table_properties->top_level_index_size},
{TablePropertiesNames::kIndexKeyIsUserKey,
&new_table_properties->index_key_is_user_key},
{TablePropertiesNames::kIndexValueIsDeltaEncoded,
&new_table_properties->index_value_is_delta_encoded},
{TablePropertiesNames::kFilterSize, &new_table_properties->filter_size},
{TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size},
{TablePropertiesNames::kRawValueSize,

@ -27,7 +27,6 @@ class Footer;
class Logger;
class RandomAccessFile;
struct TableProperties;
class InternalIterator;
class MetaIndexBuilder {
public:

@ -26,12 +26,17 @@ namespace rocksdb {
PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder(
const SliceTransform* prefix_extractor, bool whole_key_filtering,
FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval,
const bool use_value_delta_encoding,
PartitionedIndexBuilder* const p_index_builder,
const uint32_t partition_size)
: FullFilterBlockBuilder(prefix_extractor, whole_key_filtering,
filter_bits_builder),
index_on_filter_block_builder_(index_block_restart_interval),
index_on_filter_block_builder_without_seq_(index_block_restart_interval),
index_on_filter_block_builder_(index_block_restart_interval,
true /*use_delta_encoding*/,
use_value_delta_encoding),
index_on_filter_block_builder_without_seq_(index_block_restart_interval,
true /*use_delta_encoding*/,
use_value_delta_encoding),
p_index_builder_(p_index_builder),
filters_in_partition_(0),
num_added_(0) {
@ -73,10 +78,15 @@ Slice PartitionedFilterBlockBuilder::Finish(
FilterEntry& last_entry = filters.front();
std::string handle_encoding;
last_partition_block_handle.EncodeTo(&handle_encoding);
index_on_filter_block_builder_.Add(last_entry.key, handle_encoding);
std::string handle_delta_encoding;
last_partition_block_handle.EncodeSizeTo(&handle_delta_encoding);
const Slice handle_delta_encoding_slice(handle_delta_encoding);
index_on_filter_block_builder_.Add(last_entry.key, handle_encoding,
&handle_delta_encoding_slice);
if (!p_index_builder_->seperator_is_key_plus_seq()) {
index_on_filter_block_builder_without_seq_.Add(
ExtractUserKey(last_entry.key), handle_encoding);
ExtractUserKey(last_entry.key), handle_encoding,
&handle_delta_encoding_slice);
}
filters.pop_front();
} else {
@ -109,12 +119,14 @@ PartitionedFilterBlockReader::PartitionedFilterBlockReader(
const SliceTransform* prefix_extractor, bool _whole_key_filtering,
BlockContents&& contents, FilterBitsReader* /*filter_bits_reader*/,
Statistics* stats, const InternalKeyComparator comparator,
const BlockBasedTable* table, const bool index_key_includes_seq)
const BlockBasedTable* table, const bool index_key_includes_seq,
const bool index_value_is_full)
: FilterBlockReader(contents.data.size(), stats, _whole_key_filtering),
prefix_extractor_(prefix_extractor),
comparator_(comparator),
table_(table),
index_key_includes_seq_(index_key_includes_seq) {
index_key_includes_seq_(index_key_includes_seq),
index_value_is_full_(index_value_is_full) {
idx_on_fltr_blk_.reset(new Block(std::move(contents),
kDisableGlobalSequenceNumber,
0 /* read_amp_bytes_per_bit */, stats));
@ -134,15 +146,10 @@ PartitionedFilterBlockReader::~PartitionedFilterBlockReader() {
Statistics* kNullStats = nullptr;
idx_on_fltr_blk_->NewIterator<IndexBlockIter>(
&comparator_, comparator_.user_comparator(), &biter, kNullStats, true,
index_key_includes_seq_);
index_key_includes_seq_, index_value_is_full_);
biter.SeekToFirst();
for (; biter.Valid(); biter.Next()) {
auto input = biter.value();
auto s = handle.DecodeFrom(&input);
assert(s.ok());
if (!s.ok()) {
continue;
}
handle = biter.value();
auto key = BlockBasedTable::GetCacheKey(table_->rep_->cache_key_prefix,
table_->rep_->cache_key_prefix_size,
handle, cache_key);
@ -168,7 +175,7 @@ bool PartitionedFilterBlockReader::KeyMayMatch(
}
bool cached = false;
auto filter_partition =
GetFilterPartition(nullptr /* prefetch_buffer */, &filter_handle, no_io,
GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io,
&cached, prefix_extractor);
if (UNLIKELY(!filter_partition.value)) {
return true;
@ -207,7 +214,7 @@ bool PartitionedFilterBlockReader::PrefixMayMatch(
}
bool cached = false;
auto filter_partition =
GetFilterPartition(nullptr /* prefetch_buffer */, &filter_handle, no_io,
GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io,
&cached, prefix_extractor);
if (UNLIKELY(!filter_partition.value)) {
return true;
@ -225,29 +232,26 @@ bool PartitionedFilterBlockReader::PrefixMayMatch(
return res;
}
Slice PartitionedFilterBlockReader::GetFilterPartitionHandle(
BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle(
const Slice& entry) {
IndexBlockIter iter;
Statistics* kNullStats = nullptr;
idx_on_fltr_blk_->NewIterator<IndexBlockIter>(
&comparator_, comparator_.user_comparator(), &iter, kNullStats, true,
index_key_includes_seq_);
index_key_includes_seq_, index_value_is_full_);
iter.Seek(entry);
if (UNLIKELY(!iter.Valid())) {
return Slice();
return BlockHandle(0, 0);
}
assert(iter.Valid());
Slice handle_value = iter.value();
return handle_value;
BlockHandle fltr_blk_handle = iter.value();
return fltr_blk_handle;
}
BlockBasedTable::CachableEntry<FilterBlockReader>
PartitionedFilterBlockReader::GetFilterPartition(
FilePrefetchBuffer* prefetch_buffer, Slice* handle_value, const bool no_io,
bool* cached, const SliceTransform* prefix_extractor) {
BlockHandle fltr_blk_handle;
auto s = fltr_blk_handle.DecodeFrom(handle_value);
assert(s.ok());
FilePrefetchBuffer* prefetch_buffer, BlockHandle& fltr_blk_handle,
const bool no_io, bool* cached, const SliceTransform* prefix_extractor) {
const bool is_a_filter_partition = true;
auto block_cache = table_->rep_->table_options.block_cache.get();
if (LIKELY(block_cache != nullptr)) {
@ -299,39 +303,25 @@ void PartitionedFilterBlockReader::CacheDependencies(
// Before read partitions, prefetch them to avoid lots of IOs
auto rep = table_->rep_;
IndexBlockIter biter;
BlockHandle handle;
Statistics* kNullStats = nullptr;
idx_on_fltr_blk_->NewIterator<IndexBlockIter>(
&comparator_, comparator_.user_comparator(), &biter, kNullStats, true,
index_key_includes_seq_);
index_key_includes_seq_, index_value_is_full_);
// Index partitions are assumed to be consecuitive. Prefetch them all.
// Read the first block offset
biter.SeekToFirst();
Slice input = biter.value();
Status s = handle.DecodeFrom(&input);
assert(s.ok());
if (!s.ok()) {
ROCKS_LOG_WARN(rep->ioptions.info_log,
"Could not read first index partition");
return;
}
BlockHandle handle = biter.value();
uint64_t prefetch_off = handle.offset();
// Read the last block's offset
biter.SeekToLast();
input = biter.value();
s = handle.DecodeFrom(&input);
assert(s.ok());
if (!s.ok()) {
ROCKS_LOG_WARN(rep->ioptions.info_log,
"Could not read last index partition");
return;
}
handle = biter.value();
uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize;
uint64_t prefetch_len = last_off - prefetch_off;
std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
auto& file = table_->rep_->file;
prefetch_buffer.reset(new FilePrefetchBuffer());
Status s;
s = prefetch_buffer->Prefetch(file.get(), prefetch_off,
static_cast<size_t>(prefetch_len));
@ -339,14 +329,7 @@ void PartitionedFilterBlockReader::CacheDependencies(
biter.SeekToFirst();
Cache* block_cache = rep->table_options.block_cache.get();
for (; biter.Valid(); biter.Next()) {
input = biter.value();
s = handle.DecodeFrom(&input);
assert(s.ok());
if (!s.ok()) {
ROCKS_LOG_WARN(rep->ioptions.info_log, "Could not read index partition");
continue;
}
handle = biter.value();
const bool no_io = true;
const bool is_a_filter_partition = true;
auto filter = table_->GetFilter(

@ -26,6 +26,7 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
explicit PartitionedFilterBlockBuilder(
const SliceTransform* prefix_extractor, bool whole_key_filtering,
FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval,
const bool use_value_delta_encoding,
PartitionedIndexBuilder* const p_index_builder,
const uint32_t partition_size);
@ -74,7 +75,8 @@ class PartitionedFilterBlockReader : public FilterBlockReader,
const SliceTransform* prefix_extractor, bool whole_key_filtering,
BlockContents&& contents, FilterBitsReader* filter_bits_reader,
Statistics* stats, const InternalKeyComparator comparator,
const BlockBasedTable* table, const bool index_key_includes_seq);
const BlockBasedTable* table, const bool index_key_includes_seq,
const bool index_value_is_full);
virtual ~PartitionedFilterBlockReader();
virtual bool IsBlockBased() override { return false; }
@ -89,10 +91,11 @@ class PartitionedFilterBlockReader : public FilterBlockReader,
virtual size_t ApproximateMemoryUsage() const override;
private:
Slice GetFilterPartitionHandle(const Slice& entry);
BlockHandle GetFilterPartitionHandle(const Slice& entry);
BlockBasedTable::CachableEntry<FilterBlockReader> GetFilterPartition(
FilePrefetchBuffer* prefetch_buffer, Slice* handle, const bool no_io,
bool* cached, const SliceTransform* prefix_extractor = nullptr);
FilePrefetchBuffer* prefetch_buffer, BlockHandle& handle,
const bool no_io, bool* cached,
const SliceTransform* prefix_extractor = nullptr);
virtual void CacheDependencies(
bool bin, const SliceTransform* prefix_extractor) override;
@ -101,6 +104,7 @@ class PartitionedFilterBlockReader : public FilterBlockReader,
const InternalKeyComparator comparator_;
const BlockBasedTable* table_;
const bool index_key_includes_seq_;
const bool index_value_is_full_;
std::unordered_map<uint64_t,
BlockBasedTable::CachableEntry<FilterBlockReader>>
filter_map_;

@ -100,7 +100,9 @@ class PartitionedFilterBlockTest : public testing::Test {
}
PartitionedIndexBuilder* NewIndexBuilder() {
return PartitionedIndexBuilder::CreateIndexBuilder(&icomp, table_options_);
const bool kValueDeltaEncoded = true;
return PartitionedIndexBuilder::CreateIndexBuilder(
&icomp, !kValueDeltaEncoded, table_options_);
}
PartitionedFilterBlockBuilder* NewBuilder(
@ -113,11 +115,12 @@ class PartitionedFilterBlockTest : public testing::Test {
99) /
100);
partition_size = std::max(partition_size, static_cast<uint32_t>(1));
const bool kValueDeltaEncoded = true;
return new PartitionedFilterBlockBuilder(
prefix_extractor, table_options_.whole_key_filtering,
table_options_.filter_policy->GetFilterBitsBuilder(),
table_options_.index_block_restart_interval, p_index_builder,
partition_size);
table_options_.index_block_restart_interval, !kValueDeltaEncoded,
p_index_builder, partition_size);
}
std::unique_ptr<MockedBlockBasedTable> table;
@ -143,7 +146,8 @@ class PartitionedFilterBlockTest : public testing::Test {
!kSkipFilters, !kImmortal)));
auto reader = new PartitionedFilterBlockReader(
prefix_extractor, true, BlockContents(slice, false, kNoCompression),
nullptr, nullptr, icomp, table.get(), pib->seperator_is_key_plus_seq());
nullptr, nullptr, icomp, table.get(), pib->seperator_is_key_plus_seq(),
!pib->get_use_value_delta_encoding());
return reader;
}

@ -38,7 +38,6 @@ class TableReader;
class InternalKeyComparator;
class PlainTableKeyDecoder;
class GetContext;
class InternalIterator;
using std::unique_ptr;
using std::unordered_map;

@ -94,8 +94,9 @@ std::string TableProperties::ToString(
AppendProperty(result, "data block size", data_size, prop_delim, kv_delim);
char index_block_size_str[80];
snprintf(index_block_size_str, sizeof(index_block_size_str),
"index block size (user-key? %d)",
static_cast<int>(index_key_is_user_key));
"index block size (user-key? %d, delta-value? %d)",
static_cast<int>(index_key_is_user_key),
static_cast<int>(index_value_is_delta_encoded));
AppendProperty(result, index_block_size_str, index_size, prop_delim,
kv_delim);
if (index_partitions != 0) {
@ -163,6 +164,7 @@ void TableProperties::Add(const TableProperties& tp) {
index_partitions += tp.index_partitions;
top_level_index_size += tp.top_level_index_size;
index_key_is_user_key += tp.index_key_is_user_key;
index_value_is_delta_encoded += tp.index_value_is_delta_encoded;
filter_size += tp.filter_size;
raw_key_size += tp.raw_key_size;
raw_value_size += tp.raw_value_size;
@ -181,6 +183,8 @@ const std::string TablePropertiesNames::kTopLevelIndexSize =
"rocksdb.top-level.index.size";
const std::string TablePropertiesNames::kIndexKeyIsUserKey =
"rocksdb.index.key.is.user.key";
const std::string TablePropertiesNames::kIndexValueIsDeltaEncoded =
"rocksdb.index.value.is.delta.encoded";
const std::string TablePropertiesNames::kFilterSize =
"rocksdb.filter.size";
const std::string TablePropertiesNames::kRawKeySize =

@ -10,7 +10,6 @@
namespace rocksdb {
class InternalIterator;
class BlockHandle;
// Seek to the properties block.

@ -21,7 +21,6 @@ class Arena;
struct ReadOptions;
struct TableProperties;
class GetContext;
class InternalIterator;
// A Table is a sorted map from strings to strings. Tables are
// immutable and persistent. A Table may be safely accessed from

@ -2988,9 +2988,13 @@ TEST_F(HarnessTest, FooterTests) {
class IndexBlockRestartIntervalTest
: public TableTest,
public ::testing::WithParamInterface<int> {
public ::testing::WithParamInterface<std::pair<int, bool>> {
public:
static std::vector<int> GetRestartValues() { return {-1, 0, 1, 8, 16, 32}; }
static std::vector<std::pair<int, bool>> GetRestartValues() {
return {{-1, false}, {0, false}, {1, false}, {8, false},
{16, false}, {32, false}, {-1, true}, {0, true},
{1, true}, {8, true}, {16, true}, {32, true}};
}
};
INSTANTIATE_TEST_CASE_P(
@ -3002,12 +3006,16 @@ TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) {
const int kKeySize = 100;
const int kValSize = 500;
int index_block_restart_interval = GetParam();
const int index_block_restart_interval = std::get<0>(GetParam());
const bool value_delta_encoding = std::get<1>(GetParam());
Options options;
BlockBasedTableOptions table_options;
table_options.block_size = 64; // small block size to get big index block
table_options.index_block_restart_interval = index_block_restart_interval;
if (value_delta_encoding) {
table_options.format_version = 4;
}
options.table_factory.reset(new BlockBasedTableFactory(table_options));
TableConstructor c(BytewiseComparator());

@ -19,12 +19,13 @@ namespace rocksdb {
namespace {
class TwoLevelIterator : public InternalIterator {
class TwoLevelIndexIterator : public InternalIteratorBase<BlockHandle> {
public:
explicit TwoLevelIterator(TwoLevelIteratorState* state,
InternalIterator* first_level_iter);
explicit TwoLevelIndexIterator(
TwoLevelIteratorState* state,
InternalIteratorBase<BlockHandle>* first_level_iter);
virtual ~TwoLevelIterator() {
virtual ~TwoLevelIndexIterator() {
first_level_iter_.DeleteIter(false /* is_arena_mode */);
second_level_iter_.DeleteIter(false /* is_arena_mode */);
delete state_;
@ -42,7 +43,7 @@ class TwoLevelIterator : public InternalIterator {
assert(Valid());
return second_level_iter_.key();
}
virtual Slice value() const override {
virtual BlockHandle value() const override {
assert(Valid());
return second_level_iter_.value();
}
@ -68,23 +69,24 @@ class TwoLevelIterator : public InternalIterator {
}
void SkipEmptyDataBlocksForward();
void SkipEmptyDataBlocksBackward();
void SetSecondLevelIterator(InternalIterator* iter);
void SetSecondLevelIterator(InternalIteratorBase<BlockHandle>* iter);
void InitDataBlock();
TwoLevelIteratorState* state_;
IteratorWrapper first_level_iter_;
IteratorWrapper second_level_iter_; // May be nullptr
IteratorWrapperBase<BlockHandle> first_level_iter_;
IteratorWrapperBase<BlockHandle> second_level_iter_; // May be nullptr
Status status_;
// If second_level_iter is non-nullptr, then "data_block_handle_" holds the
// "index_value" passed to block_function_ to create the second_level_iter.
std::string data_block_handle_;
BlockHandle data_block_handle_;
};
TwoLevelIterator::TwoLevelIterator(TwoLevelIteratorState* state,
InternalIterator* first_level_iter)
TwoLevelIndexIterator::TwoLevelIndexIterator(
TwoLevelIteratorState* state,
InternalIteratorBase<BlockHandle>* first_level_iter)
: state_(state), first_level_iter_(first_level_iter) {}
void TwoLevelIterator::Seek(const Slice& target) {
void TwoLevelIndexIterator::Seek(const Slice& target) {
first_level_iter_.Seek(target);
InitDataBlock();
@ -94,7 +96,7 @@ void TwoLevelIterator::Seek(const Slice& target) {
SkipEmptyDataBlocksForward();
}
void TwoLevelIterator::SeekForPrev(const Slice& target) {
void TwoLevelIndexIterator::SeekForPrev(const Slice& target) {
first_level_iter_.Seek(target);
InitDataBlock();
if (second_level_iter_.iter() != nullptr) {
@ -112,7 +114,7 @@ void TwoLevelIterator::SeekForPrev(const Slice& target) {
}
}
void TwoLevelIterator::SeekToFirst() {
void TwoLevelIndexIterator::SeekToFirst() {
first_level_iter_.SeekToFirst();
InitDataBlock();
if (second_level_iter_.iter() != nullptr) {
@ -121,7 +123,7 @@ void TwoLevelIterator::SeekToFirst() {
SkipEmptyDataBlocksForward();
}
void TwoLevelIterator::SeekToLast() {
void TwoLevelIndexIterator::SeekToLast() {
first_level_iter_.SeekToLast();
InitDataBlock();
if (second_level_iter_.iter() != nullptr) {
@ -130,19 +132,19 @@ void TwoLevelIterator::SeekToLast() {
SkipEmptyDataBlocksBackward();
}
void TwoLevelIterator::Next() {
void TwoLevelIndexIterator::Next() {
assert(Valid());
second_level_iter_.Next();
SkipEmptyDataBlocksForward();
}
void TwoLevelIterator::Prev() {
void TwoLevelIndexIterator::Prev() {
assert(Valid());
second_level_iter_.Prev();
SkipEmptyDataBlocksBackward();
}
void TwoLevelIterator::SkipEmptyDataBlocksForward() {
void TwoLevelIndexIterator::SkipEmptyDataBlocksForward() {
while (second_level_iter_.iter() == nullptr ||
(!second_level_iter_.Valid() && second_level_iter_.status().ok())) {
// Move to next block
@ -158,7 +160,7 @@ void TwoLevelIterator::SkipEmptyDataBlocksForward() {
}
}
void TwoLevelIterator::SkipEmptyDataBlocksBackward() {
void TwoLevelIndexIterator::SkipEmptyDataBlocksBackward() {
while (second_level_iter_.iter() == nullptr ||
(!second_level_iter_.Valid() && second_level_iter_.status().ok())) {
// Move to next block
@ -174,24 +176,26 @@ void TwoLevelIterator::SkipEmptyDataBlocksBackward() {
}
}
void TwoLevelIterator::SetSecondLevelIterator(InternalIterator* iter) {
InternalIterator* old_iter = second_level_iter_.Set(iter);
void TwoLevelIndexIterator::SetSecondLevelIterator(
InternalIteratorBase<BlockHandle>* iter) {
InternalIteratorBase<BlockHandle>* old_iter = second_level_iter_.Set(iter);
delete old_iter;
}
void TwoLevelIterator::InitDataBlock() {
void TwoLevelIndexIterator::InitDataBlock() {
if (!first_level_iter_.Valid()) {
SetSecondLevelIterator(nullptr);
} else {
Slice handle = first_level_iter_.value();
BlockHandle handle = first_level_iter_.value();
if (second_level_iter_.iter() != nullptr &&
!second_level_iter_.status().IsIncomplete() &&
handle.compare(data_block_handle_) == 0) {
handle.offset() == data_block_handle_.offset()) {
// second_level_iter is already constructed with this iterator, so
// no need to change anything
} else {
InternalIterator* iter = state_->NewSecondaryIterator(handle);
data_block_handle_.assign(handle.data(), handle.size());
InternalIteratorBase<BlockHandle>* iter =
state_->NewSecondaryIterator(handle);
data_block_handle_ = handle;
SetSecondLevelIterator(iter);
}
}
@ -199,8 +203,9 @@ void TwoLevelIterator::InitDataBlock() {
} // namespace
InternalIterator* NewTwoLevelIterator(TwoLevelIteratorState* state,
InternalIterator* first_level_iter) {
return new TwoLevelIterator(state, first_level_iter);
InternalIteratorBase<BlockHandle>* NewTwoLevelIterator(
TwoLevelIteratorState* state,
InternalIteratorBase<BlockHandle>* first_level_iter) {
return new TwoLevelIndexIterator(state, first_level_iter);
}
} // namespace rocksdb

@ -22,7 +22,8 @@ struct TwoLevelIteratorState {
TwoLevelIteratorState() {}
virtual ~TwoLevelIteratorState() {}
virtual InternalIterator* NewSecondaryIterator(const Slice& handle) = 0;
virtual InternalIteratorBase<BlockHandle>* NewSecondaryIterator(
const BlockHandle& handle) = 0;
};
@ -36,7 +37,8 @@ struct TwoLevelIteratorState {
// Uses a supplied function to convert an index_iter value into
// an iterator over the contents of the corresponding block.
// Note: this function expects first_level_iter was not created using the arena
extern InternalIterator* NewTwoLevelIterator(
TwoLevelIteratorState* state, InternalIterator* first_level_iter);
extern InternalIteratorBase<BlockHandle>* NewTwoLevelIterator(
TwoLevelIteratorState* state,
InternalIteratorBase<BlockHandle>* first_level_iter);
} // namespace rocksdb

@ -64,12 +64,27 @@ extern Slice GetLengthPrefixedSlice(const char* data);
extern Slice GetSliceUntil(Slice* slice, char delimiter);
// Borrowed from https://github.com/facebook/fbthrift/blob/449a5f77f9f9bae72c9eb5e78093247eef185c04/thrift/lib/cpp/util/VarintUtils-inl.h#L202-L208
constexpr inline uint64_t i64ToZigzag(const int64_t l) {
return (static_cast<uint64_t>(l) << 1) ^ static_cast<uint64_t>(l >> 63);
}
inline int64_t zigzagToI64(uint64_t n) {
return (n >> 1) ^ -static_cast<int64_t>(n & 1);
}
// Pointer-based variants of GetVarint... These either store a value
// in *v and return a pointer just past the parsed value, or return
// nullptr on error. These routines only look at bytes in the range
// [p..limit-1]
extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v);
extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v);
inline const char* GetVarsignedint64Ptr(const char* p, const char* limit,
int64_t* value) {
uint64_t u = 0;
const char* ret = GetVarint64Ptr(p, limit, &u);
*value = zigzagToI64(u);
return ret;
}
// Returns the length of the varint32 or varint64 encoding of "v"
extern int VarintLength(uint64_t v);
@ -249,11 +264,18 @@ inline char* EncodeVarint64(char* dst, uint64_t v) {
}
inline void PutVarint64(std::string* dst, uint64_t v) {
char buf[10];
char buf[kMaxVarint64Length];
char* ptr = EncodeVarint64(buf, v);
dst->append(buf, static_cast<size_t>(ptr - buf));
}
inline void PutVarsignedint64(std::string* dst, int64_t v) {
char buf[kMaxVarint64Length];
// Using Zigzag format to convert signed to unsigned
char* ptr = EncodeVarint64(buf, i64ToZigzag(v));
dst->append(buf, static_cast<size_t>(ptr - buf));
}
inline void PutVarint64Varint64(std::string* dst, uint64_t v1, uint64_t v2) {
char buf[20];
char* ptr = EncodeVarint64(buf, v1);

@ -20,7 +20,7 @@ namespace rocksdb {
namespace test {
const uint32_t kDefaultFormatVersion = BlockBasedTableOptions().format_version;
const uint32_t kLatestFormatVersion = 3u;
const uint32_t kLatestFormatVersion = 4u;
Slice RandomString(Random* rnd, int len, std::string* dst) {
dst->resize(len);

@ -406,7 +406,7 @@ class TransactionTestBase : public ::testing::Test {
if (empty_wal) {
ASSERT_OK(s);
} else {
// Test that we can detect the WAL that is produced by an incompatbile
// Test that we can detect the WAL that is produced by an incompatible
// WritePolicy and fail fast before mis-interpreting the WAL.
ASSERT_TRUE(s.IsNotSupported());
return;

Loading…
Cancel
Save