diff --git a/HISTORY.md b/HISTORY.md index 653018937..6b89342f8 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,6 +2,7 @@ ## Unreleased ### Public API Change ### New Features +* Changes the format of index blocks by delta encoding the index values, which are the block handles. This saves the encoding of BlockHandle::offset of the non-head index entries in each restart interval. The feature is backward compatible but not forward compatible. It is disabled by default unless format_version 4 or above is used. ### Bug Fixes ## 5.15.0 (7/17/2018) @@ -13,7 +14,7 @@ * The "rocksdb.num.entries" table property no longer counts range deletion tombstones as entries. ### New Features -* Changes the format of index blocks by storing the key in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatbile but not forward compatible. It is disabled by default unless format_version 3 or above is used. +* Changes the format of index blocks by storing the key in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatible but not forward compatible. It is disabled by default unless format_version 3 or above is used. * Avoid memcpy when reading mmap files with OpenReadOnly and max_open_files==-1. * Support dynamically changing `ColumnFamilyOptions::ttl` via `SetOptions()`. * Add a new table property, "rocksdb.num.range-deletions", which counts the number of range deletion tombstones in the table. diff --git a/db/builder.h b/db/builder.h index 0e8218e74..9995723df 100644 --- a/db/builder.h +++ b/db/builder.h @@ -35,7 +35,6 @@ class VersionEdit; class TableBuilder; class WritableFileWriter; class InternalStats; -class InternalIterator; // @param column_family_name Name of the column family that is also identified // by column_family_id, or empty string if unknown. 
It must outlive the diff --git a/db/db_impl.cc b/db/db_impl.cc index a9fde0b23..1aeb5ab68 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1047,7 +1047,7 @@ InternalIterator* DBImpl::NewInternalIterator( } else { CleanupSuperVersion(super_version); } - return NewErrorInternalIterator(s, arena); + return NewErrorInternalIterator(s, arena); } ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const { diff --git a/db/db_iter.h b/db/db_iter.h index 989a0ba1f..56b6a42db 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -23,7 +23,6 @@ namespace rocksdb { class Arena; class DBIter; -class InternalIterator; // Return a new iterator that converts internal keys (yielded by // "*internal_iter") that were live at the specified "sequence" number diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index 819758e3f..bdd50ec6c 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -180,17 +180,16 @@ void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) { ResetTableProperties(tp); sscanf(tp_string.c_str(), "# data blocks %" SCNu64 " # entries %" SCNu64 - " # range deletions %" SCNu64 - " raw key size %" SCNu64 + " # range deletions %" SCNu64 " raw key size %" SCNu64 " raw average key size %lf " " raw value size %" SCNu64 " raw average value size %lf " " data block size %" SCNu64 " index block size (user-key? %" SCNu64 - ") %" SCNu64 " filter block size %" SCNu64, + ", delta-value? 
%" SCNu64 ") %" SCNu64 " filter block size %" SCNu64, &tp->num_data_blocks, &tp->num_entries, &tp->num_range_deletions, &tp->raw_key_size, &dummy_double, &tp->raw_value_size, &dummy_double, - &tp->data_size, &tp->index_key_is_user_key, &tp->index_size, - &tp->filter_size); + &tp->data_size, &tp->index_key_is_user_key, + &tp->index_value_is_delta_encoded, &tp->index_size, &tp->filter_size); } void VerifySimilar(uint64_t a, uint64_t b, double bias) { @@ -224,14 +223,11 @@ void VerifyTableProperties(const TableProperties& base_tp, ASSERT_EQ(base_tp.num_range_deletions, new_tp.num_range_deletions); } -void GetExpectedTableProperties(TableProperties* expected_tp, - const int kKeySize, const int kValueSize, - const int kKeysPerTable, - const int kRangeDeletionsPerTable, - const int kTableCount, - const int kBloomBitsPerKey, - const size_t kBlockSize, - const bool index_key_is_user_key) { +void GetExpectedTableProperties( + TableProperties* expected_tp, const int kKeySize, const int kValueSize, + const int kKeysPerTable, const int kRangeDeletionsPerTable, + const int kTableCount, const int kBloomBitsPerKey, const size_t kBlockSize, + const bool index_key_is_user_key, const bool value_delta_encoding) { const int kKeyCount = kTableCount * kKeysPerTable; const int kRangeDeletionCount = kTableCount * kRangeDeletionsPerTable; const int kAvgSuccessorSize = kKeySize / 5; @@ -248,7 +244,9 @@ void GetExpectedTableProperties(TableProperties* expected_tp, kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize)); expected_tp->index_size = expected_tp->num_data_blocks * - (kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8)); + (kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8) - + // discount 1 byte as value size is not encoded in value delta encoding + (value_delta_encoding ? 
1 : 0)); expected_tp->filter_size = kTableCount * (kKeysPerTable * kBloomBitsPerKey / 8); } @@ -342,12 +340,14 @@ TEST_F(DBPropertiesTest, AggregatedTableProperties) { TableProperties output_tp; ParseTablePropertiesString(property, &output_tp); bool index_key_is_user_key = output_tp.index_key_is_user_key > 0; + bool value_is_delta_encoded = output_tp.index_value_is_delta_encoded > 0; TableProperties expected_tp; GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, kKeysPerTable, kRangeDeletionsPerTable, kTableCount, kBloomBitsPerKey, - table_options.block_size, index_key_is_user_key); + table_options.block_size, index_key_is_user_key, + value_is_delta_encoded); VerifyTableProperties(expected_tp, output_tp); } @@ -533,6 +533,7 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string); ParseTablePropertiesString(tp_string, &tp); bool index_key_is_user_key = tp.index_key_is_user_key > 0; + bool value_is_delta_encoded = tp.index_value_is_delta_encoded > 0; ASSERT_EQ(sum_tp.data_size, tp.data_size); ASSERT_EQ(sum_tp.index_size, tp.index_size); ASSERT_EQ(sum_tp.filter_size, tp.filter_size); @@ -542,10 +543,10 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { ASSERT_EQ(sum_tp.num_entries, tp.num_entries); ASSERT_EQ(sum_tp.num_range_deletions, tp.num_range_deletions); if (table > 3) { - GetExpectedTableProperties( - &expected_tp, kKeySize, kValueSize, kKeysPerTable, - kRangeDeletionsPerTable, table, kBloomBitsPerKey, - table_options.block_size, index_key_is_user_key); + GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, + kKeysPerTable, kRangeDeletionsPerTable, table, + kBloomBitsPerKey, table_options.block_size, + index_key_is_user_key, value_is_delta_encoded); // Gives larger bias here as index block size, filter block size, // and data block size become much harder to estimate in this test. 
VerifyTableProperties(expected_tp, tp, 0.5, 0.4, 0.4, 0.25); diff --git a/db/db_test_util.cc b/db/db_test_util.cc index 5867713ae..90560d90e 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -449,8 +449,8 @@ Options DBTestBase::GetOptions( options.prefix_extractor.reset(NewNoopTransform()); break; } - case kBlockBasedTableWithPartitionedIndexFormat3: { - table_options.format_version = 3; + case kBlockBasedTableWithPartitionedIndexFormat4: { + table_options.format_version = 4; // Format 3 changes the binary index format. Since partitioned index is a // super-set of simple indexes, we are also using kTwoLevelIndexSearch to // test this format. diff --git a/db/db_test_util.h b/db/db_test_util.h index 56ab733bf..30ee2e0c2 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -698,7 +698,7 @@ class DBTestBase : public testing::Test { kLevelSubcompactions, kBlockBasedTableWithIndexRestartInterval, kBlockBasedTableWithPartitionedIndex, - kBlockBasedTableWithPartitionedIndexFormat3, + kBlockBasedTableWithPartitionedIndexFormat4, kPartitionedFilterWithNewTableReaderForCompactions, kUniversalSubcompactions, // This must be the last line diff --git a/db/memtable.h b/db/memtable.h index 6082a8e6c..f1dbb7012 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -34,7 +34,6 @@ namespace rocksdb { class Mutex; class MemTableIterator; class MergeContext; -class InternalIterator; struct ImmutableMemTableOptions { explicit ImmutableMemTableOptions(const ImmutableCFOptions& ioptions, diff --git a/db/merge_helper.h b/db/merge_helper.h index db941808e..abb1e1756 100644 --- a/db/merge_helper.h +++ b/db/merge_helper.h @@ -26,7 +26,6 @@ class Iterator; class Logger; class MergeOperator; class Statistics; -class InternalIterator; class MergeHelper { public: diff --git a/db/table_cache.cc b/db/table_cache.cc index edfffb0e8..f374a6876 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -238,7 +238,7 @@ InternalIterator* TableCache::NewIterator( if (s.ok()) { if 
(options.table_filter && !options.table_filter(*table_reader->GetTableProperties())) { - result = NewEmptyInternalIterator(arena); + result = NewEmptyInternalIterator(arena); } else { result = table_reader->NewIterator(options, prefix_extractor, arena, skip_filters, for_compaction); @@ -279,7 +279,7 @@ InternalIterator* TableCache::NewIterator( } if (!s.ok()) { assert(result == nullptr); - result = NewErrorInternalIterator(s, arena); + result = NewErrorInternalIterator(s, arena); } return result; } diff --git a/db/table_cache.h b/db/table_cache.h index dcf09edff..7e7f53cc1 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -31,7 +31,6 @@ class Arena; struct FileDescriptor; class GetContext; class HistogramImpl; -class InternalIterator; class TableCache { public: diff --git a/db/version_edit.cc b/db/version_edit.cc index 40d25999e..447fbf378 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -46,7 +46,7 @@ enum CustomTag : uint32_t { kTerminate = 1, // The end of customized fields kNeedCompaction = 2, // Since Manifest is not entirely currently forward-compatible, and the only - // forward-compatbile part is the CutsomtTag of kNewFile, we currently encode + // forward-compatible part is the CutsomtTag of kNewFile, we currently encode // kMinLogNumberToKeep as part of a CustomTag as a hack. This should be // removed when manifest becomes forward-comptabile. kMinLogNumberToKeepHack = 3, @@ -274,7 +274,7 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { break; case kMinLogNumberToKeepHack: // This is a hack to encode kMinLogNumberToKeep in a - // forward-compatbile fashion. + // forward-compatible fashion. 
if (!GetFixed64(&field, &min_log_number_to_keep_)) { return "deleted log number malformatted"; } diff --git a/db/version_set.h b/db/version_set.h index fe8f26339..f57f84fb7 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -52,7 +52,6 @@ class Writer; } class Compaction; -class InternalIterator; class LogBuffer; class LookupKey; class MemTable; diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index 32ddb6c98..d545e455f 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -34,6 +34,7 @@ struct TablePropertiesNames { static const std::string kIndexPartitions; static const std::string kTopLevelIndexSize; static const std::string kIndexKeyIsUserKey; + static const std::string kIndexValueIsDeltaEncoded; static const std::string kFilterSize; static const std::string kRawKeySize; static const std::string kRawValueSize; @@ -139,6 +140,8 @@ struct TableProperties { // Whether the index key is user key. Otherwise it includes 8 byte of sequence // number added by internal key format. uint64_t index_key_is_user_key = 0; + // Whether delta encoding is used to encode the index values. + uint64_t index_value_is_delta_encoded = 0; // the size of filter block. uint64_t filter_size = 0; // total raw key size diff --git a/table/block.cc b/table/block.cc index c1891801c..0e6c2b112 100644 --- a/table/block.cc +++ b/table/block.cc @@ -33,28 +33,65 @@ namespace rocksdb { // // If any errors are detected, returns nullptr. Otherwise, returns a // pointer to the key delta (just past the three decoded values). 
-static inline const char* DecodeEntry(const char* p, const char* limit, - uint32_t* shared, - uint32_t* non_shared, - uint32_t* value_length) { - if (limit - p < 3) return nullptr; - *shared = reinterpret_cast(p)[0]; - *non_shared = reinterpret_cast(p)[1]; - *value_length = reinterpret_cast(p)[2]; - if ((*shared | *non_shared | *value_length) < 128) { - // Fast path: all three values are encoded in one byte each - p += 3; - } else { - if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; - if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; - if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) return nullptr; - } +struct DecodeEntry { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared, + uint32_t* value_length) { + // We need 2 bytes for shared and non_shared size. We also need one more + // byte either for value size or the actual value in case of value delta + // encoding. + assert(limit - p >= 3); + *shared = reinterpret_cast(p)[0]; + *non_shared = reinterpret_cast(p)[1]; + *value_length = reinterpret_cast(p)[2]; + if ((*shared | *non_shared | *value_length) < 128) { + // Fast path: all three values are encoded in one byte each + p += 3; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) { + return nullptr; + } + } - if (static_cast(limit - p) < (*non_shared + *value_length)) { - return nullptr; + // Using an assert in place of "return null" since we should not pay the + // cost of checking for corruption on every single key decoding + assert(!(static_cast(limit - p) < (*non_shared + *value_length))); + return p; + } +}; + +struct DecodeKey { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared) { + uint32_t value_length; + return 
DecodeEntry()(p, limit, shared, non_shared, &value_length); + } +}; + +// In format_version 4, which is used by index blocks, the value size is not +// encoded before the entry, as the value is known to be the handle with the +// known size. +struct DecodeKeyV4 { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared) { + // We need 2 bytes for shared and non_shared size. We also need one more + // byte either for value size or the actual value in case of value delta + // encoding. + if (limit - p < 3) return nullptr; + *shared = reinterpret_cast(p)[0]; + *non_shared = reinterpret_cast(p)[1]; + if ((*shared | *non_shared) < 128) { + // Fast path: all three values are encoded in one byte each + p += 2; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + } + return p; } - return p; -} +}; void DataBlockIter::Next() { assert(Valid()); @@ -170,7 +207,8 @@ void DataBlockIter::Seek(const Slice& target) { return; } uint32_t index = 0; - bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, comparator_); + bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, + comparator_); if (!ok) { return; @@ -198,8 +236,12 @@ void IndexBlockIter::Seek(const Slice& target) { bool ok = false; if (prefix_index_) { ok = PrefixSeek(target, &index); + } else if (value_delta_encoded_) { + ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, + active_comparator_); } else { - ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, active_comparator_); + ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, + active_comparator_); } if (!ok) { @@ -222,7 +264,8 @@ void DataBlockIter::SeekForPrev(const Slice& target) { return; } uint32_t index = 0; - bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, comparator_); + bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, + comparator_); if (!ok) { 
return; @@ -277,7 +320,8 @@ void IndexBlockIter::SeekToLast() { } } -void BlockIter::CorruptionError() { +template +void BlockIter::CorruptionError() { current_ = restarts_; restart_index_ = num_restarts_; status_ = Status::Corruption("bad entry in block"); @@ -298,7 +342,7 @@ bool DataBlockIter::ParseNextDataKey() { // Decode next entry uint32_t shared, non_shared, value_length; - p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); + p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length); if (p == nullptr || key_.Size() < shared) { CorruptionError(); return false; @@ -340,10 +384,14 @@ bool DataBlockIter::ParseNextDataKey() { } value_ = Slice(p + non_shared, value_length); - while (restart_index_ + 1 < num_restarts_ && - GetRestartPoint(restart_index_ + 1) < current_) { - ++restart_index_; + if (shared == 0) { + while (restart_index_ + 1 < num_restarts_ && + GetRestartPoint(restart_index_ + 1) < current_) { + ++restart_index_; + } } + // else we are in the middle of a restart interval and the restart_index_ + // thus has not changed return true; } } @@ -361,7 +409,12 @@ bool IndexBlockIter::ParseNextIndexKey() { // Decode next entry uint32_t shared, non_shared, value_length; - p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); + if (value_delta_encoded_) { + p = DecodeKeyV4()(p, limit, &shared, &non_shared); + value_length = 0; + } else { + p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length); + } if (p == nullptr || key_.Size() < shared) { CorruptionError(); return false; @@ -377,27 +430,69 @@ bool IndexBlockIter::ParseNextIndexKey() { key_pinned_ = false; } value_ = Slice(p + non_shared, value_length); - while (restart_index_ + 1 < num_restarts_ && - GetRestartPoint(restart_index_ + 1) < current_) { - ++restart_index_; + if (shared == 0) { + while (restart_index_ + 1 < num_restarts_ && + GetRestartPoint(restart_index_ + 1) < current_) { + ++restart_index_; + } + } + // else we are in the middle of a restart interval 
and the restart_index_ + // thus has not changed + if (value_delta_encoded_) { + assert(value_length == 0); + DecodeCurrentValue(shared); } return true; } +// The format: +// restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// ... +// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// where, k is key, v is value, and its encoding is in parenthesis. +// The format of each key is (shared_size, non_shared_size, shared, non_shared) +// The format of each value, i.e., block handle, is (offset, size) whenever the +// shared_size is 0, which includes the first entry in each restart point. +// Otherwise the format is delta-size = block handle size - size of last block +// handle. +void IndexBlockIter::DecodeCurrentValue(uint32_t shared) { + assert(value_delta_encoded_); + const char* limit = data_ + restarts_; + if (shared == 0) { + uint64_t o, s; + const char* newp = GetVarint64Ptr(value_.data(), limit, &o); + newp = GetVarint64Ptr(newp, limit, &s); + decoded_value_ = BlockHandle(o, s); + value_ = Slice(value_.data(), newp - value_.data()); + } else { + uint64_t next_value_base = + decoded_value_.offset() + decoded_value_.size() + kBlockTrailerSize; + int64_t delta; + const char* newp = GetVarsignedint64Ptr(value_.data(), limit, &delta); + decoded_value_ = + BlockHandle(next_value_base, decoded_value_.size() + delta); + value_ = Slice(value_.data(), newp - value_.data()); + } +} + // Binary search in restart array to find the first restart point that // is either the last restart point with a key less than target, // which means the key of next restart point is larger than target, or // the first restart point with a key = target -bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right, - uint32_t* index, const Comparator* comp) { +template +template +bool BlockIter::BinarySeek(const Slice& target, uint32_t left, + uint32_t
right, uint32_t* index, + const Comparator* comp) { assert(left <= right); while (left < right) { uint32_t mid = (left + right + 1) / 2; uint32_t region_offset = GetRestartPoint(mid); - uint32_t shared, non_shared, value_length; - const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_, - &shared, &non_shared, &value_length); + uint32_t shared, non_shared; + const char* key_ptr = DecodeKeyFunc()( + data_ + region_offset, data_ + restarts_, &shared, &non_shared); if (key_ptr == nullptr || (shared != 0)) { CorruptionError(); return false; @@ -425,9 +520,13 @@ bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right, // Return -1 if error. int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) { uint32_t region_offset = GetRestartPoint(block_index); - uint32_t shared, non_shared, value_length; - const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_, - &shared, &non_shared, &value_length); + uint32_t shared, non_shared; + const char* key_ptr = + value_delta_encoded_ + ? 
DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared, + &non_shared) + : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared, + &non_shared); if (key_ptr == nullptr || (shared != 0)) { CorruptionError(); return 1; // Return target is smaller @@ -544,6 +643,7 @@ DataBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, DataBlockIter* iter, Statistics* stats, bool /*total_order_seek*/, bool /*key_includes_seq*/, + bool /*value_is_full*/, BlockPrefixIndex* /*prefix_index*/) { DataBlockIter* ret_iter; if (iter != nullptr) { @@ -577,7 +677,7 @@ template <> IndexBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek, - bool key_includes_seq, + bool key_includes_seq, bool value_is_full, BlockPrefixIndex* prefix_index) { IndexBlockIter* ret_iter; if (iter != nullptr) { @@ -597,7 +697,8 @@ IndexBlockIter* Block::NewIterator(const Comparator* cmp, BlockPrefixIndex* prefix_index_ptr = total_order_seek ? nullptr : prefix_index; ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_, - prefix_index_ptr, key_includes_seq, cachable()); + prefix_index_ptr, key_includes_seq, value_is_full, + cachable()); } return ret_iter; diff --git a/table/block.h b/table/block.h index 8ee450ca9..a29ea1689 100644 --- a/table/block.h +++ b/table/block.h @@ -35,6 +35,7 @@ namespace rocksdb { struct BlockContents; class Comparator; +template class BlockIter; class DataBlockIter; class IndexBlockIter; @@ -164,6 +165,11 @@ class Block { // If iter is null, return new Iterator // If iter is not null, update this one and return it as Iterator* // + // key_includes_seq, default true, means that the keys are in internal key + // format. + // value_is_full, default ture, means that no delta encoding is + // applied to values. + // // NewIterator // Same as above but also updates read_amp_bitmap_ if it is not nullptr. 
// @@ -175,13 +181,11 @@ class Block { // the iterator will simply be set as "invalid", rather than returning // the key that is just pass the target key. template - TBlockIter* NewIterator(const Comparator* comparator, - const Comparator* user_comparator, - TBlockIter* iter = nullptr, - Statistics* stats = nullptr, - bool total_order_seek = true, - bool key_includes_seq = true, - BlockPrefixIndex* prefix_index = nullptr); + TBlockIter* NewIterator( + const Comparator* comparator, const Comparator* user_comparator, + TBlockIter* iter = nullptr, Statistics* stats = nullptr, + bool total_order_seek = true, bool key_includes_seq = true, + bool value_is_full = true, BlockPrefixIndex* prefix_index = nullptr); // Report an approximation of how much memory has been used. size_t ApproximateMemoryUsage() const; @@ -204,7 +208,8 @@ class Block { void operator=(const Block&) = delete; }; -class BlockIter : public InternalIterator { +template +class BlockIter : public InternalIteratorBase { public: void InitializeBase(const Comparator* comparator, const char* data, uint32_t restarts, uint32_t num_restarts, @@ -243,10 +248,6 @@ class BlockIter : public InternalIterator { assert(Valid()); return key_.GetKey(); } - virtual Slice value() const override { - assert(Valid()); - return value_; - } #ifndef NDEBUG virtual ~BlockIter() { @@ -280,7 +281,8 @@ class BlockIter : public InternalIterator { const char* data_; // underlying block contents uint32_t num_restarts_; // Number of uint32_t entries in restart array - uint32_t restart_index_; // Index of restart block in which current_ falls + // Index of restart block in which current_ or current_-1 falls + uint32_t restart_index_; uint32_t restarts_; // Offset of restart array (list of fixed32) // current_ is offset in data_ of current entry. 
>= restarts_ if !Valid uint32_t current_; @@ -316,11 +318,12 @@ void CorruptionError(); - bool BinarySeek(const Slice& target, uint32_t left, uint32_t right, - uint32_t* index, const Comparator* comp); + template + inline bool BinarySeek(const Slice& target, uint32_t left, uint32_t right, + uint32_t* index, const Comparator* comp); }; -class DataBlockIter final : public BlockIter { +class DataBlockIter final : public BlockIter { public: DataBlockIter() : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {} @@ -405,14 +408,14 @@ class DataBlockIter final : public BlockIter { std::vector prev_entries_; int32_t prev_entries_idx_ = -1; - bool ParseNextDataKey(); + inline bool ParseNextDataKey(); inline int Compare(const IterKey& ikey, const Slice& b) const { return comparator_->Compare(ikey.GetInternalKey(), b); } }; -class IndexBlockIter final : public BlockIter { +class IndexBlockIter final : public BlockIter { public: IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {} @@ -420,27 +423,46 @@ class IndexBlockIter final : public BlockIter { assert(Valid()); return key_.GetKey(); } + // key_includes_seq, default true, means that the keys are in internal key + // format. + // value_is_full, default true, means that no delta encoding is + // applied to values.
IndexBlockIter(const Comparator* comparator, const Comparator* user_comparator, const char* data, uint32_t restarts, uint32_t num_restarts, BlockPrefixIndex* prefix_index, bool key_includes_seq, - bool block_contents_pinned) + bool value_is_full, bool block_contents_pinned) : IndexBlockIter() { Initialize(comparator, user_comparator, data, restarts, num_restarts, - prefix_index, key_includes_seq, block_contents_pinned); + prefix_index, key_includes_seq, block_contents_pinned, + value_is_full); } void Initialize(const Comparator* comparator, const Comparator* user_comparator, const char* data, uint32_t restarts, uint32_t num_restarts, BlockPrefixIndex* prefix_index, bool key_includes_seq, - bool block_contents_pinned) { + bool value_is_full, bool block_contents_pinned) { InitializeBase(comparator, data, restarts, num_restarts, kDisableGlobalSequenceNumber, block_contents_pinned); key_includes_seq_ = key_includes_seq; active_comparator_ = key_includes_seq_ ? comparator_ : user_comparator; key_.SetIsUserKey(!key_includes_seq_); prefix_index_ = prefix_index; + value_delta_encoded_ = !value_is_full; + } + + virtual BlockHandle value() const override { + assert(Valid()); + if (value_delta_encoded_) { + return decoded_value_; + } else { + BlockHandle handle; + Slice v = value_; + Status decode_s __attribute__((__unused__)) = handle.DecodeFrom(&v); + assert(decode_s.ok()); + return handle; + } } virtual void Seek(const Slice& target) override; @@ -467,11 +489,25 @@ class IndexBlockIter final : public BlockIter { void Invalidate(Status s) { InvalidateBase(s); } private: + // Key is in InternalKey format + bool key_includes_seq_; + bool value_delta_encoded_; + // key_includes_seq_ ? comparator_ : user_comparator_ + const Comparator* active_comparator_; + BlockPrefixIndex* prefix_index_; + // Whether the value is delta encoded. In that case the value is assumed to be + // BlockHandle. 
The first value in each restart interval is the full encoded + // BlockHandle; the rest only encode the delta-encoded size part of the BlockHandle. The + // offset of delta encoded BlockHandles is computed by adding the size of + // previous delta encoded values in the same restart interval to the offset of + // the first value in that restart interval. + BlockHandle decoded_value_; + bool PrefixSeek(const Slice& target, uint32_t* index); bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, uint32_t left, uint32_t right, uint32_t* index); - int CompareBlockKey(uint32_t block_index, const Slice& target); + inline int CompareBlockKey(uint32_t block_index, const Slice& target); inline int Compare(const Slice& a, const Slice& b) const { return active_comparator_->Compare(a, b); @@ -481,13 +517,11 @@ return active_comparator_->Compare(ikey.GetKey(), b); } - bool ParseNextIndexKey(); + inline bool ParseNextIndexKey(); - // Key is in InternalKey format - bool key_includes_seq_; - // key_includes_seq_ ?
comparator_ : user_comparator_ - const Comparator* active_comparator_; - BlockPrefixIndex* prefix_index_; + // When value_delta_encoded_ is enabled it decodes the value which is assumed + // to be BlockHandle and put it to decoded_value_ + inline void DecodeCurrentValue(uint32_t shared); }; } // namespace rocksdb diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 768fb7566..b0234e7dc 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -63,6 +63,7 @@ namespace { FilterBlockBuilder* CreateFilterBlockBuilder( const ImmutableCFOptions& /*opt*/, const MutableCFOptions& mopt, const BlockBasedTableOptions& table_opt, + const bool use_delta_encoding_for_index_values, PartitionedIndexBuilder* const p_index_builder) { if (table_opt.filter_policy == nullptr) return nullptr; @@ -85,7 +86,7 @@ FilterBlockBuilder* CreateFilterBlockBuilder( return new PartitionedFilterBlockBuilder( mopt.prefix_extractor.get(), table_opt.whole_key_filtering, filter_bits_builder, table_opt.index_block_restart_interval, - p_index_builder, partition_size); + use_delta_encoding_for_index_values, p_index_builder, partition_size); } else { return new FullFilterBlockBuilder(mopt.prefix_extractor.get(), table_opt.whole_key_filtering, @@ -266,6 +267,7 @@ struct BlockBasedTableBuilder::Rep { TableProperties props; bool closed = false; // Either Finish() or Abandon() has been called. 
+ const bool use_delta_encoding_for_index_values; std::unique_ptr filter_builder; char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize]; size_t compressed_cache_key_prefix_size; @@ -306,6 +308,8 @@ struct BlockBasedTableBuilder::Rep { internal_prefix_transform(_moptions.prefix_extractor.get()), compression_dict(_compression_dict), compression_ctx(_compression_type, _compression_opts), + use_delta_encoding_for_index_values(table_opt.format_version >= 4 && + !table_opt.block_align), compressed_cache_key_prefix_size(0), flush_block_policy( table_options.flush_block_policy_factory->NewFlushBlockPolicy( @@ -317,18 +321,21 @@ struct BlockBasedTableBuilder::Rep { if (table_options.index_type == BlockBasedTableOptions::kTwoLevelIndexSearch) { p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder( - &internal_comparator, table_options); + &internal_comparator, use_delta_encoding_for_index_values, + table_options); index_builder.reset(p_index_builder_); } else { index_builder.reset(IndexBuilder::CreateIndexBuilder( table_options.index_type, &internal_comparator, - &this->internal_prefix_transform, table_options)); + &this->internal_prefix_transform, use_delta_encoding_for_index_values, + table_options)); } if (skip_filters) { filter_builder = nullptr; } else { filter_builder.reset(CreateFilterBlockBuilder( - _ioptions, _moptions, table_options, p_index_builder_)); + _ioptions, _moptions, table_options, + use_delta_encoding_for_index_values, p_index_builder_)); } for (auto& collector_factories : *int_tbl_prop_collector_factories) { @@ -793,6 +800,8 @@ void BlockBasedTableBuilder::WritePropertiesBlock( } rep_->props.index_key_is_user_key = !rep_->index_builder->seperator_is_key_plus_seq(); + rep_->props.index_value_is_delta_encoded = + rep_->use_delta_encoding_for_index_values; rep_->props.creation_time = rep_->creation_time; rep_->props.oldest_key_time = rep_->oldest_key_time; diff --git a/table/block_based_table_reader.cc 
b/table/block_based_table_reader.cc index cd292a3fe..83b722cc9 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -214,7 +214,8 @@ class PartitionIndexReader : public IndexReader, public Cleanable { const InternalKeyComparator* icomparator, IndexReader** index_reader, const PersistentCacheOptions& cache_options, - const int level, const bool index_key_includes_seq) { + const int level, const bool index_key_includes_seq, + const bool index_value_is_full) { std::unique_ptr index_block; auto s = ReadBlockFromFile( file, prefetch_buffer, footer, ReadOptions(), index_handle, @@ -225,36 +226,37 @@ class PartitionIndexReader : public IndexReader, public Cleanable { if (s.ok()) { *index_reader = new PartitionIndexReader( table, icomparator, std::move(index_block), ioptions.statistics, - level, index_key_includes_seq); + level, index_key_includes_seq, index_value_is_full); } return s; } // return a two-level iterator: first level is on the partition index - virtual InternalIterator* NewIterator(IndexBlockIter* /*iter*/ = nullptr, - bool /*dont_care*/ = true, - bool fill_cache = true) override { + virtual InternalIteratorBase* NewIterator( + IndexBlockIter* /*iter*/ = nullptr, bool /*dont_care*/ = true, + bool fill_cache = true) override { Statistics* kNullStats = nullptr; // Filters are already checked before seeking the index if (!partition_map_.empty()) { return NewTwoLevelIterator( new BlockBasedTable::PartitionedIndexIteratorState( - table_, &partition_map_, index_key_includes_seq_), + table_, &partition_map_, index_key_includes_seq_, + index_value_is_full_), index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), nullptr, - kNullStats, true, index_key_includes_seq_)); + kNullStats, true, index_key_includes_seq_, index_value_is_full_)); } else { auto ro = ReadOptions(); ro.fill_cache = fill_cache; bool kIsIndex = true; - return new BlockBasedTableIterator( + return new BlockBasedTableIterator( table_, ro, 
*icomparator_, index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), nullptr, - kNullStats, true, index_key_includes_seq_), + kNullStats, true, index_key_includes_seq_, index_value_is_full_), false, true, /* prefix_extractor */ nullptr, kIsIndex, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); } // TODO(myabandeh): Update TwoLevelIterator to be able to make use of // on-stack BlockIter while the state is on heap. Currentlly it assumes @@ -270,7 +272,7 @@ class PartitionIndexReader : public IndexReader, public Cleanable { Statistics* kNullStats = nullptr; index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); // Index partitions are assumed to be consecuitive. Prefetch them all. // Read the first block offset biter.SeekToFirst(); @@ -278,14 +280,7 @@ class PartitionIndexReader : public IndexReader, public Cleanable { // Empty index. return; } - Slice input = biter.value(); - Status s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Could not read first index partition"); - return; - } + handle = biter.value(); uint64_t prefetch_off = handle.offset(); // Read the last block's offset @@ -294,36 +289,21 @@ class PartitionIndexReader : public IndexReader, public Cleanable { // Empty index. 
return; } - input = biter.value(); - s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Could not read last index partition"); - return; - } + handle = biter.value(); uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; auto& file = table_->rep_->file; prefetch_buffer.reset(new FilePrefetchBuffer()); - s = prefetch_buffer->Prefetch(file.get(), prefetch_off, - static_cast(prefetch_len)); + Status s = prefetch_buffer->Prefetch(file.get(), prefetch_off, + static_cast(prefetch_len)); // After prefetch, read the partitions one by one biter.SeekToFirst(); auto ro = ReadOptions(); Cache* block_cache = rep->table_options.block_cache.get(); for (; biter.Valid(); biter.Next()) { - input = biter.value(); - s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Could not read index partition"); - continue; - } - + handle = biter.value(); BlockBasedTable::CachableEntry block; Slice compression_dict; if (rep->compression_dict_block) { @@ -374,11 +354,13 @@ class PartitionIndexReader : public IndexReader, public Cleanable { PartitionIndexReader(BlockBasedTable* table, const InternalKeyComparator* icomparator, std::unique_ptr&& index_block, Statistics* stats, - const int /*level*/, const bool index_key_includes_seq) + const int /*level*/, const bool index_key_includes_seq, + const bool index_value_is_full) : IndexReader(icomparator, stats), table_(table), index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq) { + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { assert(index_block_ != nullptr); } BlockBasedTable* table_; @@ -386,6 +368,7 @@ class PartitionIndexReader : public IndexReader, public Cleanable { std::unordered_map> partition_map_; const bool index_key_includes_seq_; + const bool 
index_value_is_full_; }; // Index that allows binary search lookup for the first key of each block. @@ -404,7 +387,8 @@ class BinarySearchIndexReader : public IndexReader { const InternalKeyComparator* icomparator, IndexReader** index_reader, const PersistentCacheOptions& cache_options, - const bool index_key_includes_seq) { + const bool index_key_includes_seq, + const bool index_value_is_full) { std::unique_ptr index_block; auto s = ReadBlockFromFile( file, prefetch_buffer, footer, ReadOptions(), index_handle, @@ -415,19 +399,19 @@ class BinarySearchIndexReader : public IndexReader { if (s.ok()) { *index_reader = new BinarySearchIndexReader( icomparator, std::move(index_block), ioptions.statistics, - index_key_includes_seq); + index_key_includes_seq, index_value_is_full); } return s; } - virtual InternalIterator* NewIterator(IndexBlockIter* iter = nullptr, - bool /*dont_care*/ = true, - bool /*dont_care*/ = true) override { + virtual InternalIteratorBase* NewIterator( + IndexBlockIter* iter = nullptr, bool /*dont_care*/ = true, + bool /*dont_care*/ = true) override { Statistics* kNullStats = nullptr; return index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), iter, kNullStats, true, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); } virtual size_t size() const override { return index_block_->size(); } @@ -449,31 +433,32 @@ class BinarySearchIndexReader : public IndexReader { private: BinarySearchIndexReader(const InternalKeyComparator* icomparator, std::unique_ptr&& index_block, - Statistics* stats, const bool index_key_includes_seq) + Statistics* stats, const bool index_key_includes_seq, + const bool index_value_is_full) : IndexReader(icomparator, stats), index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq) { + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { assert(index_block_ != nullptr); } std::unique_ptr index_block_; const bool 
index_key_includes_seq_; + const bool index_value_is_full_; }; // Index that leverages an internal hash table to quicken the lookup for a given // key. class HashIndexReader : public IndexReader { public: - static Status Create(const SliceTransform* hash_key_extractor, - const Footer& footer, RandomAccessFileReader* file, - FilePrefetchBuffer* prefetch_buffer, - const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icomparator, - const BlockHandle& index_handle, - InternalIterator* meta_index_iter, - IndexReader** index_reader, - bool /*hash_index_allow_collision*/, - const PersistentCacheOptions& cache_options, - const bool index_key_includes_seq) { + static Status Create( + const SliceTransform* hash_key_extractor, const Footer& footer, + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icomparator, const BlockHandle& index_handle, + InternalIterator* meta_index_iter, IndexReader** index_reader, + bool /*hash_index_allow_collision*/, + const PersistentCacheOptions& cache_options, + const bool index_key_includes_seq, const bool index_value_is_full) { std::unique_ptr index_block; auto s = ReadBlockFromFile( file, prefetch_buffer, footer, ReadOptions(), index_handle, @@ -489,9 +474,9 @@ class HashIndexReader : public IndexReader { // hard error. We can still fall back to the original binary search index. // So, Create will succeed regardless, from this point on. 
- auto new_index_reader = - new HashIndexReader(icomparator, std::move(index_block), - ioptions.statistics, index_key_includes_seq); + auto new_index_reader = new HashIndexReader( + icomparator, std::move(index_block), ioptions.statistics, + index_key_includes_seq, index_value_is_full); *index_reader = new_index_reader; // Get prefixes block @@ -545,13 +530,14 @@ class HashIndexReader : public IndexReader { return Status::OK(); } - virtual InternalIterator* NewIterator(IndexBlockIter* iter = nullptr, - bool total_order_seek = true, - bool /*dont_care*/ = true) override { + virtual InternalIteratorBase* NewIterator( + IndexBlockIter* iter = nullptr, bool total_order_seek = true, + bool /*dont_care*/ = true) override { Statistics* kNullStats = nullptr; return index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), iter, kNullStats, - total_order_seek, index_key_includes_seq_, prefix_index_.get()); + total_order_seek, index_key_includes_seq_, index_value_is_full_, + prefix_index_.get()); } virtual size_t size() const override { return index_block_->size(); } @@ -577,10 +563,12 @@ class HashIndexReader : public IndexReader { private: HashIndexReader(const InternalKeyComparator* icomparator, std::unique_ptr&& index_block, Statistics* stats, - const bool index_key_includes_seq) + const bool index_key_includes_seq, + const bool index_value_is_full) : IndexReader(icomparator, stats), index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq) { + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { assert(index_block_ != nullptr); } @@ -591,6 +579,7 @@ class HashIndexReader : public IndexReader { std::unique_ptr prefix_index_; BlockContents prefixes_contents_; const bool index_key_includes_seq_; + const bool index_value_is_full_; }; // Helper function to setup the cache key's prefix for the Table. 
@@ -1030,8 +1019,9 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, bool disable_prefix_seek = rep->index_type == BlockBasedTableOptions::kHashSearch && need_upper_bound_check; - unique_ptr iter(new_table->NewIndexIterator( - ReadOptions(), disable_prefix_seek, nullptr, &index_entry)); + unique_ptr> iter( + new_table->NewIndexIterator(ReadOptions(), disable_prefix_seek, + nullptr, &index_entry)); s = iter->status(); if (s.ok()) { // This is the first call to NewIndexIterator() since we're in Open(). @@ -1435,7 +1425,9 @@ FilterBlockReader* BlockBasedTable::ReadFilter( rep->whole_key_filtering, std::move(block), nullptr, rep->ioptions.statistics, rep->internal_comparator, this, rep_->table_properties == nullptr || - !rep_->table_properties->index_key_is_user_key); + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0); } case Rep::FilterType::kBlockFilter: @@ -1553,7 +1545,7 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( // disable_prefix_seek should be set to true when prefix_extractor found in SST // differs from the one in mutable_cf_options and index type is HashBasedIndex -InternalIterator* BlockBasedTable::NewIndexIterator( +InternalIteratorBase* BlockBasedTable::NewIndexIterator( const ReadOptions& read_options, bool disable_prefix_seek, IndexBlockIter* input_iter, CachableEntry* index_entry, GetContext* get_context) { @@ -1592,7 +1584,8 @@ InternalIterator* BlockBasedTable::NewIndexIterator( input_iter->Invalidate(Status::Incomplete("no blocking io")); return input_iter; } else { - return NewErrorInternalIterator(Status::Incomplete("no blocking io")); + return NewErrorInternalIterator( + Status::Incomplete("no blocking io")); } } @@ -1639,7 +1632,7 @@ InternalIterator* BlockBasedTable::NewIndexIterator( input_iter->Invalidate(s); return input_iter; } else { - return NewErrorInternalIterator(s); + return 
NewErrorInternalIterator(s); } } @@ -1660,21 +1653,6 @@ InternalIterator* BlockBasedTable::NewIndexIterator( return iter; } -template -TBlockIter* BlockBasedTable::NewDataBlockIterator( - Rep* rep, const ReadOptions& ro, const Slice& index_value, - TBlockIter* input_iter, bool is_index, bool key_includes_seq, - GetContext* get_context, FilePrefetchBuffer* prefetch_buffer) { - BlockHandle handle; - Slice input = index_value; - // We intentionally allow extra stuff in index_value so that we - // can add more features in the future. - Status s = handle.DecodeFrom(&input); - return NewDataBlockIterator(rep, ro, handle, input_iter, is_index, - key_includes_seq, get_context, s, - prefetch_buffer); -} - // Convert an index iterator value (i.e., an encoded BlockHandle) // into an iterator over the contents of the corresponding block. // If input_iter is null, new a iterator @@ -1683,7 +1661,8 @@ template TBlockIter* BlockBasedTable::NewDataBlockIterator( Rep* rep, const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter, bool is_index, bool key_includes_seq, - GetContext* get_context, Status s, FilePrefetchBuffer* prefetch_buffer) { + bool index_key_is_full, GetContext* get_context, Status s, + FilePrefetchBuffer* prefetch_buffer) { PERF_TIMER_GUARD(new_table_block_iter_nanos); const bool no_io = (ro.read_tier == kBlockCacheTier); @@ -1733,7 +1712,8 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( const bool kTotalOrderSeek = true; iter = block.value->NewIterator( &rep->internal_comparator, rep->internal_comparator.user_comparator(), - iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq); + iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq, + index_key_is_full); if (block.cache_handle != nullptr) { iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, block.cache_handle); @@ -1848,22 +1828,20 @@ Status BlockBasedTable::MaybeLoadDataBlockToCache( 
BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( BlockBasedTable* table, std::unordered_map>* block_map, - bool index_key_includes_seq) + bool index_key_includes_seq, bool index_key_is_full) : table_(table), block_map_(block_map), - index_key_includes_seq_(index_key_includes_seq) {} + index_key_includes_seq_(index_key_includes_seq), + index_key_is_full_(index_key_is_full) {} -template -const size_t BlockBasedTableIterator::kMaxReadaheadSize = +template +const size_t BlockBasedTableIterator::kMaxReadaheadSize = 256 * 1024; -InternalIterator* +InternalIteratorBase* BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( - const Slice& index_value) { + const BlockHandle& handle) { // Return a block iterator on the index partition - BlockHandle handle; - Slice input = index_value; - Status s = handle.DecodeFrom(&input); auto rep = table_->get_rep(); auto block = block_map_->find(handle.offset()); // This is a possible scenario since block cache might not have had space @@ -1879,10 +1857,10 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( Statistics* kNullStats = nullptr; return block->second.value->NewIterator( &rep->internal_comparator, rep->internal_comparator.user_comparator(), - nullptr, kNullStats, true, index_key_includes_seq_); + nullptr, kNullStats, true, index_key_includes_seq_, index_key_is_full_); } // Create an empty iterator - return new DataBlockIter(); + return new IndexBlockIter(); } // This will be broken if the user specifies an unusual implementation @@ -1955,7 +1933,7 @@ bool BlockBasedTable::PrefixMayMatch( // Then, try find it within each block // we already know prefix_extractor and prefix_extractor_name must match // because `CheckPrefixMayMatch` first checks `check_filter_ == true` - unique_ptr iiter( + unique_ptr> iiter( NewIndexIterator(no_io_read_options, /* need_upper_bound_check */ false)); iiter->Seek(internal_prefix); @@ -1988,10 +1966,7 @@ bool 
BlockBasedTable::PrefixMayMatch( // after the data block corresponding to iiter->key() cannot // possibly contain the key. Thus, the corresponding data block // is the only on could potentially contain the prefix. - Slice handle_value = iiter->value(); - BlockHandle handle; - s = handle.DecodeFrom(&handle_value); - assert(s.ok()); + BlockHandle handle = iiter->value(); may_match = filter->PrefixMayMatch(prefix, prefix_extractor, handle.offset()); } @@ -2015,8 +1990,8 @@ bool BlockBasedTable::PrefixMayMatch( return may_match; } -template -void BlockBasedTableIterator::Seek(const Slice& target) { +template +void BlockBasedTableIterator::Seek(const Slice& target) { is_out_of_bound_ = false; if (!CheckPrefixMayMatch(target)) { ResetDataIter(); @@ -2045,8 +2020,9 @@ void BlockBasedTableIterator::Seek(const Slice& target) { block_iter_.key()) <= 0)); } -template -void BlockBasedTableIterator::SeekForPrev(const Slice& target) { +template +void BlockBasedTableIterator::SeekForPrev( + const Slice& target) { is_out_of_bound_ = false; if (!CheckPrefixMayMatch(target)) { ResetDataIter(); @@ -2088,8 +2064,8 @@ void BlockBasedTableIterator::SeekForPrev(const Slice& target) { icomp_.Compare(target, block_iter_.key()) >= 0); } -template -void BlockBasedTableIterator::SeekToFirst() { +template +void BlockBasedTableIterator::SeekToFirst() { is_out_of_bound_ = false; SavePrevIndexValue(); index_iter_->SeekToFirst(); @@ -2102,8 +2078,8 @@ void BlockBasedTableIterator::SeekToFirst() { FindKeyForward(); } -template -void BlockBasedTableIterator::SeekToLast() { +template +void BlockBasedTableIterator::SeekToLast() { is_out_of_bound_ = false; SavePrevIndexValue(); index_iter_->SeekToLast(); @@ -2116,32 +2092,30 @@ void BlockBasedTableIterator::SeekToLast() { FindKeyBackward(); } -template -void BlockBasedTableIterator::Next() { +template +void BlockBasedTableIterator::Next() { assert(block_iter_points_to_real_block_); block_iter_.Next(); FindKeyForward(); } -template -void 
BlockBasedTableIterator::Prev() { +template +void BlockBasedTableIterator::Prev() { assert(block_iter_points_to_real_block_); block_iter_.Prev(); FindKeyBackward(); } -template -void BlockBasedTableIterator::InitDataBlock() { - BlockHandle data_block_handle; - Slice handle_slice = index_iter_->value(); +template +void BlockBasedTableIterator::InitDataBlock() { + BlockHandle data_block_handle = index_iter_->value(); if (!block_iter_points_to_real_block_ || - handle_slice.compare(prev_index_value_) != 0 || + data_block_handle.offset() != prev_index_value_.offset() || // if previous attempt of reading the block missed cache, try again block_iter_.status().IsIncomplete()) { if (block_iter_points_to_real_block_) { ResetDataIter(); } - Status s = data_block_handle.DecodeFrom(&handle_slice); auto* rep = table_->get_rep(); // Automatically prefetch additional data when a range scan (iterator) does @@ -2173,16 +2147,17 @@ void BlockBasedTableIterator::InitDataBlock() { } } + Status s; BlockBasedTable::NewDataBlockIterator( rep, read_options_, data_block_handle, &block_iter_, is_index_, - key_includes_seq_, + key_includes_seq_, index_key_is_full_, /* get_context */ nullptr, s, prefetch_buffer_.get()); block_iter_points_to_real_block_ = true; } } -template -void BlockBasedTableIterator::FindKeyForward() { +template +void BlockBasedTableIterator::FindKeyForward() { assert(!is_out_of_bound_); // TODO the while loop inherits from two-level-iterator. We don't know // whether a block can be empty so it can be replaced by an "if". 
@@ -2221,8 +2196,8 @@ void BlockBasedTableIterator::FindKeyForward() { } } -template -void BlockBasedTableIterator::FindKeyBackward() { +template +void BlockBasedTableIterator::FindKeyBackward() { assert(!is_out_of_bound_); while (!block_iter_.Valid()) { if (!block_iter_.status().ok()) { @@ -2297,11 +2272,10 @@ InternalIterator* BlockBasedTable::NewRangeTombstoneIterator( return iter; } } - std::string str; - rep_->range_del_handle.EncodeTo(&str); // The meta-block exists but isn't in uncompressed block cache (maybe // because it is disabled), so go through the full lookup process. - return NewDataBlockIterator(rep_, read_options, Slice(str)); + return NewDataBlockIterator(rep_, read_options, + rep_->range_del_handle); } bool BlockBasedTable::FullFilterKeyMayMatch( @@ -2364,7 +2338,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, auto iiter = NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, /* index_entry */ nullptr, get_context); - std::unique_ptr iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr.reset(iiter); } @@ -2372,12 +2346,10 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, bool matched = false; // if such user key mathced a key in SST bool done = false; for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { - Slice handle_value = iiter->value(); + BlockHandle handle = iiter->value(); - BlockHandle handle; bool not_exist_in_filter = filter != nullptr && filter->IsBlockBased() == true && - handle.DecodeFrom(&handle_value).ok() && !filter->KeyMayMatch(ExtractUserKey(key), prefix_extractor, handle.offset(), no_io); @@ -2455,9 +2427,10 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, IndexBlockIter iiter_on_stack; auto iiter = NewIndexIterator(ReadOptions(), false, &iiter_on_stack); - std::unique_ptr iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { - 
iiter_unique_ptr = std::unique_ptr(iiter); + iiter_unique_ptr = + std::unique_ptr>(iiter); } if (!iiter->status().ok()) { @@ -2470,7 +2443,7 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid(); iiter->Next()) { - Slice block_handle = iiter->value(); + BlockHandle block_handle = iiter->value(); const bool is_user_key = rep_->table_properties && rep_->table_properties->index_key_is_user_key > 0; if (end && @@ -2516,11 +2489,12 @@ Status BlockBasedTable::VerifyChecksum() { } // Check Data blocks IndexBlockIter iiter_on_stack; - InternalIterator* iiter = + InternalIteratorBase* iiter = NewIndexIterator(ReadOptions(), false, &iiter_on_stack); - std::unique_ptr iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { - iiter_unique_ptr = std::unique_ptr(iiter); + iiter_unique_ptr = + std::unique_ptr>(iiter); } if (!iiter->status().ok()) { // error opening index iterator @@ -2530,19 +2504,41 @@ Status BlockBasedTable::VerifyChecksum() { return s; } -Status BlockBasedTable::VerifyChecksumInBlocks(InternalIterator* index_iter) { +Status BlockBasedTable::VerifyChecksumInBlocks( + InternalIteratorBase* index_iter) { Status s; for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { s = index_iter->status(); if (!s.ok()) { break; } - BlockHandle handle; - Slice input = index_iter->value(); - s = handle.DecodeFrom(&input); + BlockHandle handle = index_iter->value(); + BlockContents contents; + Slice dummy_comp_dict; + BlockFetcher block_fetcher(rep_->file.get(), nullptr /* prefetch buffer */, + rep_->footer, ReadOptions(), handle, &contents, + rep_->ioptions, false /* decompress */, + dummy_comp_dict /*compression dict*/, + rep_->persistent_cache_options); + s = block_fetcher.ReadBlockContents(); + if (!s.ok()) { + break; + } + } + return s; +} + +Status BlockBasedTable::VerifyChecksumInBlocks( + InternalIteratorBase* index_iter) { + Status s; + for 
(index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { + s = index_iter->status(); if (!s.ok()) { break; } + BlockHandle handle; + Slice input = index_iter->value(); + s = handle.DecodeFrom(&input); BlockContents contents; Slice dummy_comp_dict; BlockFetcher block_fetcher(rep_->file.get(), nullptr /* prefetch buffer */, @@ -2560,15 +2556,13 @@ Status BlockBasedTable::VerifyChecksumInBlocks(InternalIterator* index_iter) { bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, const Slice& key) { - std::unique_ptr iiter(NewIndexIterator(options)); + std::unique_ptr> iiter( + NewIndexIterator(options)); iiter->Seek(key); assert(iiter->Valid()); CachableEntry block; - BlockHandle handle; - Slice input = iiter->value(); - Status s = handle.DecodeFrom(&input); - assert(s.ok()); + BlockHandle handle = iiter->value(); Cache* block_cache = rep_->table_options.block_cache.get(); assert(block_cache != nullptr); @@ -2578,6 +2572,7 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, cache_key_storage); Slice ckey; + Status s; s = GetDataBlockFromCache( cache_key, ckey, block_cache, nullptr, rep_->ioptions, options, &block, rep_->table_options.format_version, @@ -2638,14 +2633,18 @@ Status BlockBasedTable::CreateIndexReader( rep_->ioptions, icomparator, index_reader, rep_->persistent_cache_options, level, rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0); + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0); } case BlockBasedTableOptions::kBinarySearch: { return BinarySearchIndexReader::Create( file, prefetch_buffer, footer, footer.index_handle(), rep_->ioptions, icomparator, index_reader, rep_->persistent_cache_options, rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0); + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == 
nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0); } case BlockBasedTableOptions::kHashSearch: { std::unique_ptr meta_guard; @@ -2665,7 +2664,9 @@ Status BlockBasedTable::CreateIndexReader( rep_->ioptions, icomparator, index_reader, rep_->persistent_cache_options, rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0); + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0); } meta_index_iter = meta_iter_guard.get(); } @@ -2676,7 +2677,9 @@ Status BlockBasedTable::CreateIndexReader( index_reader, rep_->hash_index_allow_collision, rep_->persistent_cache_options, rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0); + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0); } default: { std::string error_message = @@ -2687,22 +2690,14 @@ Status BlockBasedTable::CreateIndexReader( } uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) { - unique_ptr index_iter(NewIndexIterator(ReadOptions())); + unique_ptr> index_iter( + NewIndexIterator(ReadOptions())); index_iter->Seek(key); uint64_t result; if (index_iter->Valid()) { - BlockHandle handle; - Slice input = index_iter->value(); - Status s = handle.DecodeFrom(&input); - if (s.ok()) { - result = handle.offset(); - } else { - // Strange: we can't decode the block handle in the index block. - // We'll just return the offset of the metaindex block, which is - // close to the whole file size for this case. - result = rep_->footer.metaindex_handle().offset(); - } + BlockHandle handle = index_iter->value(); + result = handle.offset(); } else { // key is past the last key in the file. 
If table_properties is not // available, approximate the offset by returning the offset of the @@ -2729,7 +2724,7 @@ bool BlockBasedTable::TEST_index_reader_preloaded() const { Status BlockBasedTable::GetKVPairsFromDataBlocks( std::vector* kv_pair_blocks) { - std::unique_ptr blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions())); Status s = blockhandles_iter->status(); @@ -2944,7 +2939,7 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { out_file->Append( "Index Details:\n" "--------------------------------------\n"); - std::unique_ptr blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions())); Status s = blockhandles_iter->status(); if (!s.ok()) { @@ -2993,7 +2988,7 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { } Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { - std::unique_ptr blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions())); Status s = blockhandles_iter->status(); if (!s.ok()) { @@ -3013,9 +3008,7 @@ Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { break; } - Slice bh_val = blockhandles_iter->value(); - BlockHandle bh; - bh.DecodeFrom(&bh_val); + BlockHandle bh = blockhandles_iter->value(); uint64_t datablock_size = bh.size(); datablock_size_min = std::min(datablock_size_min, datablock_size); datablock_size_max = std::max(datablock_size_max, datablock_size); diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 7e7b41a71..3cada0c2c 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -51,7 +51,6 @@ struct BlockBasedTableOptions; struct EnvOptions; struct ReadOptions; class GetContext; -class InternalIterator; using std::unique_ptr; @@ -178,9 +177,9 @@ class BlockBasedTable : public TableReader { // to // a different object then iter and the callee has the ownership of the // returned object. 
- virtual InternalIterator* NewIterator(IndexBlockIter* iter = nullptr, - bool total_order_seek = true, - bool fill_cache = true) = 0; + virtual InternalIteratorBase* NewIterator( + IndexBlockIter* iter = nullptr, bool total_order_seek = true, + bool fill_cache = true) = 0; // The size of the index. virtual size_t size() const = 0; @@ -224,14 +223,16 @@ class BlockBasedTable : public TableReader { static TBlockIter* NewDataBlockIterator( Rep* rep, const ReadOptions& ro, const Slice& index_value, TBlockIter* input_iter = nullptr, bool is_index = false, - bool key_includes_seq = true, GetContext* get_context = nullptr, + bool key_includes_seq = true, bool index_key_is_full = true, + GetContext* get_context = nullptr, FilePrefetchBuffer* prefetch_buffer = nullptr); template static TBlockIter* NewDataBlockIterator( Rep* rep, const ReadOptions& ro, const BlockHandle& block_hanlde, TBlockIter* input_iter = nullptr, bool is_index = false, - bool key_includes_seq = true, GetContext* get_context = nullptr, - Status s = Status(), FilePrefetchBuffer* prefetch_buffer = nullptr); + bool key_includes_seq = true, bool index_key_is_full = true, + GetContext* get_context = nullptr, Status s = Status(), + FilePrefetchBuffer* prefetch_buffer = nullptr); class PartitionedIndexIteratorState; @@ -284,7 +285,7 @@ class BlockBasedTable : public TableReader { // 2. index is not present in block cache. // 3. 
We disallowed any io to be performed, that is, read_options == // kBlockCacheTier - InternalIterator* NewIndexIterator( + InternalIteratorBase* NewIndexIterator( const ReadOptions& read_options, bool need_upper_bound_check = false, IndexBlockIter* input_iter = nullptr, CachableEntry* index_entry = nullptr, @@ -353,7 +354,8 @@ class BlockBasedTable : public TableReader { std::unique_ptr* meta_block, std::unique_ptr* iter); - Status VerifyChecksumInBlocks(InternalIterator* index_iter); + Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); + Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); // Create the filter from the filter block. virtual FilterBlockReader* ReadFilter( @@ -390,14 +392,16 @@ class BlockBasedTable::PartitionedIndexIteratorState PartitionedIndexIteratorState( BlockBasedTable* table, std::unordered_map>* block_map, - const bool index_key_includes_seq); - InternalIterator* NewSecondaryIterator(const Slice& index_value) override; + const bool index_key_includes_seq, const bool index_key_is_full); + InternalIteratorBase* NewSecondaryIterator( + const BlockHandle& index_value) override; private: // Don't own table_ BlockBasedTable* table_; std::unordered_map>* block_map_; bool index_key_includes_seq_; + bool index_key_is_full_; }; // CachableEntry represents the entries that *may* be fetched from block cache. 
@@ -521,16 +525,17 @@ struct BlockBasedTable::Rep { const bool immortal_table; }; -template -class BlockBasedTableIterator : public InternalIterator { +template +class BlockBasedTableIterator : public InternalIteratorBase { public: BlockBasedTableIterator(BlockBasedTable* table, const ReadOptions& read_options, const InternalKeyComparator& icomp, - InternalIterator* index_iter, bool check_filter, - bool need_upper_bound_check, + InternalIteratorBase* index_iter, + bool check_filter, bool need_upper_bound_check, const SliceTransform* prefix_extractor, bool is_index, bool key_includes_seq = true, + bool index_key_is_full = true, bool for_compaction = false) : table_(table), read_options_(read_options), @@ -543,6 +548,7 @@ class BlockBasedTableIterator : public InternalIterator { prefix_extractor_(prefix_extractor), is_index_(is_index), key_includes_seq_(key_includes_seq), + index_key_is_full_(index_key_is_full), for_compaction_(for_compaction) {} ~BlockBasedTableIterator() { delete index_iter_; } @@ -561,7 +567,7 @@ class BlockBasedTableIterator : public InternalIterator { assert(Valid()); return block_iter_.key(); } - Slice value() const override { + TValue value() const override { assert(Valid()); return block_iter_.value(); } @@ -618,8 +624,7 @@ class BlockBasedTableIterator : public InternalIterator { if (block_iter_points_to_real_block_) { // Reseek. If they end up with the same data block, we shouldn't re-fetch // the same data block. 
- Slice v = index_iter_->value(); - prev_index_value_.assign(v.data(), v.size()); + prev_index_value_ = index_iter_->value(); } } @@ -631,7 +636,7 @@ class BlockBasedTableIterator : public InternalIterator { BlockBasedTable* table_; const ReadOptions read_options_; const InternalKeyComparator& icomp_; - InternalIterator* index_iter_; + InternalIteratorBase* index_iter_; PinnedIteratorsManager* pinned_iters_mgr_; TBlockIter block_iter_; bool block_iter_points_to_real_block_; @@ -644,10 +649,10 @@ class BlockBasedTableIterator : public InternalIterator { bool is_index_; // If the keys in the blocks over which we iterate include 8 byte sequence bool key_includes_seq_; + bool index_key_is_full_; // If this iterator is created for compaction bool for_compaction_; - // TODO use block offset instead - std::string prev_index_value_; + BlockHandle prev_index_value_; static const size_t kInitReadaheadSize = 8 * 1024; // Found that 256 KB readahead size provides the best performance, based on diff --git a/table/block_builder.cc b/table/block_builder.cc index 39bfffe51..ba4ef09ec 100644 --- a/table/block_builder.cc +++ b/table/block_builder.cc @@ -41,9 +41,11 @@ namespace rocksdb { -BlockBuilder::BlockBuilder(int block_restart_interval, bool use_delta_encoding) +BlockBuilder::BlockBuilder(int block_restart_interval, bool use_delta_encoding, + bool use_value_delta_encoding) : block_restart_interval_(block_restart_interval), use_delta_encoding_(use_delta_encoding), + use_value_delta_encoding_(use_value_delta_encoding), restarts_(), counter_(0), finished_(false) { @@ -65,14 +67,27 @@ void BlockBuilder::Reset() { size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, const Slice& value) const { size_t estimate = CurrentSizeEstimate(); - estimate += key.size() + value.size(); + // Note: this is an imprecise estimate as it accounts for the whole key size + // instead of non-shared key size. 
+ estimate += key.size(); + // In value delta encoding we estimate the value delta size as half the full + // value size since only the size field of block handle is encoded. + estimate += + !use_value_delta_encoding_ || (counter_ >= block_restart_interval_) + ? value.size() + : value.size() / 2; + if (counter_ >= block_restart_interval_) { estimate += sizeof(uint32_t); // a new restart entry. } estimate += sizeof(int32_t); // varint for shared prefix length. + // Note: this is an imprecise estimate as we will have two encoded sizes, one + // for the shared key and one for the non-shared key. estimate += VarintLength(key.size()); // varint for key length. - estimate += VarintLength(value.size()); // varint for value length. + if (!use_value_delta_encoding_ || (counter_ >= block_restart_interval_)) { + estimate += VarintLength(value.size()); // varint for value length. + } return estimate; } @@ -87,9 +102,11 @@ Slice BlockBuilder::Finish() { return Slice(buffer_); } -void BlockBuilder::Add(const Slice& key, const Slice& value) { +void BlockBuilder::Add(const Slice& key, const Slice& value, + const Slice* const delta_value) { assert(!finished_); assert(counter_ <= block_restart_interval_); + assert(!use_value_delta_encoding_ || delta_value); size_t shared = 0; // number of bytes shared with prev key if (counter_ >= block_restart_interval_) { // Restart compression @@ -115,14 +132,27 @@ void BlockBuilder::Add(const Slice& key, const Slice& value) { const size_t non_shared = key.size() - shared; const size_t curr_size = buffer_.size(); - // Add "" to buffer_ - PutVarint32Varint32Varint32(&buffer_, static_cast(shared), - static_cast(non_shared), - static_cast(value.size())); + if (use_value_delta_encoding_) { + // Add "" to buffer_ + PutVarint32Varint32(&buffer_, static_cast(shared), + static_cast(non_shared)); + } else { + // Add "" to buffer_ + PutVarint32Varint32Varint32(&buffer_, static_cast(shared), + static_cast(non_shared), + static_cast(value.size())); + } // Add string
delta to buffer_ followed by value buffer_.append(key.data() + shared, non_shared); - buffer_.append(value.data(), value.size()); + // Use value delta encoding only when the key has shared bytes. This would + // simplify the decoding, where it can figure which decoding to use simply by + // looking at the shared bytes size. + if (shared != 0 && use_value_delta_encoding_) { + buffer_.append(delta_value->data(), delta_value->size()); + } else { + buffer_.append(value.data(), value.size()); + } counter_++; estimate_ += buffer_.size() - curr_size; diff --git a/table/block_builder.h b/table/block_builder.h index 6b5297d04..f2be4c020 100644 --- a/table/block_builder.h +++ b/table/block_builder.h @@ -21,14 +21,16 @@ class BlockBuilder { void operator=(const BlockBuilder&) = delete; explicit BlockBuilder(int block_restart_interval, - bool use_delta_encoding = true); + bool use_delta_encoding = true, + bool use_value_delta_encoding = false); // Reset the contents as if the BlockBuilder was just constructed. void Reset(); // REQUIRES: Finish() has not been called since the last call to Reset(). // REQUIRES: key is larger than any previously added key - void Add(const Slice& key, const Slice& value); + void Add(const Slice& key, const Slice& value, + const Slice* const delta_value = nullptr); // Finish building the block and return a slice that refers to the // block contents. 
The returned slice will remain valid for the @@ -49,7 +51,10 @@ class BlockBuilder { private: const int block_restart_interval_; + //TODO(myabandeh): put it into a separate IndexBlockBuilder const bool use_delta_encoding_; + // Refer to BlockIter::DecodeCurrentValue for format of delta encoded values + const bool use_value_delta_encoding_; std::string buffer_; // Destination buffer std::vector restarts_; // Restart points diff --git a/table/block_test.cc b/table/block_test.cc index 0ed450f07..009740a28 100644 --- a/table/block_test.cc +++ b/table/block_test.cc @@ -68,6 +68,29 @@ void GenerateRandomKVs(std::vector *keys, } } +// Same as GenerateRandomKVs but the values are BlockHandle +void GenerateRandomKBHs(std::vector *keys, + std::vector *values, const int from, + const int len, const int step = 1, + const int padding_size = 0, + const int keys_share_prefix = 1) { + Random rnd(302); + uint64_t offset = 0; + + // generate different prefix + for (int i = from; i < from + len; i += step) { + // generate keys that shares the prefix + for (int j = 0; j < keys_share_prefix; ++j) { + keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); + + uint64_t size = rnd.Uniform(1024 * 16); + BlockHandle handle(offset, size); + offset += size + kBlockTrailerSize; + values->emplace_back(handle); + } + } +} + class BlockTest : public testing::Test {}; // block test @@ -131,6 +154,84 @@ TEST_F(BlockTest, SimpleTest) { delete iter; } +TEST_F(BlockTest, ValueDeltaEncodingTest) { + Random rnd(301); + Options options = Options(); + std::unique_ptr ic; + ic.reset(new test::PlainInternalKeyComparator(options.comparator)); + + std::vector keys; + std::vector values; + const bool kUseDeltaEncoding = true; + const bool kUseValueDeltaEncoding = true; + BlockBuilder builder(16, kUseDeltaEncoding, kUseValueDeltaEncoding); + int num_records = 100; + + GenerateRandomKBHs(&keys, &values, 0, num_records); + // add a bunch of records to a block + BlockHandle last_encoded_handle; + for (int i = 
0; i < num_records; i++) { + auto block_handle = values[i]; + std::string handle_encoding; + block_handle.EncodeTo(&handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64(&handle_delta_encoding, + block_handle.size() - last_encoded_handle.size()); + last_encoded_handle = block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + builder.Add(keys[i], handle_encoding, &handle_delta_encoding_slice); + } + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + contents.cachable = false; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + const bool kTotalOrderSeek = true; + const bool kIncludesSeq = true; + const bool kValueIsFull = !kUseValueDeltaEncoding; + IndexBlockIter *kNullIter = nullptr; + Statistics *kNullStats = nullptr; + // read contents of block sequentially + int count = 0; + InternalIteratorBase *iter = reader.NewIterator( + options.comparator, options.comparator, kNullIter, kNullStats, + kTotalOrderSeek, kIncludesSeq, kValueIsFull); + for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { + // read kv from block + Slice k = iter->key(); + BlockHandle handle = iter->value(); + + // compare with lookaside array + ASSERT_EQ(k.ToString().compare(keys[count]), 0); + + ASSERT_EQ(values[count].offset(), handle.offset()); + ASSERT_EQ(values[count].size(), handle.size()); + } + delete iter; + + // read block contents randomly + iter = reader.NewIterator( + options.comparator, options.comparator, kNullIter, kNullStats, + kTotalOrderSeek, kIncludesSeq, kValueIsFull); + for (int i = 0; i < num_records; i++) { + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + Slice k(keys[index]); + + // search in block for this key + iter->Seek(k); + ASSERT_TRUE(iter->Valid()); + BlockHandle handle = iter->value(); + ASSERT_EQ(values[index].offset(), 
handle.offset()); + ASSERT_EQ(values[index].size(), handle.size()); + } + delete iter; +} // return the block contents BlockContents GetBlockContents(std::unique_ptr *builder, const std::vector &keys, diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index d008ff1a3..fb14b1759 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -374,15 +374,12 @@ Slice CuckooTableIterator::value() const { return curr_value_; } -extern InternalIterator* NewErrorInternalIterator(const Status& status, - Arena* arena); - InternalIterator* CuckooTableReader::NewIterator( const ReadOptions& /*read_options*/, const SliceTransform* /* prefix_extractor */, Arena* arena, bool /*skip_filters*/, bool /*for_compaction*/) { if (!status().ok()) { - return NewErrorInternalIterator( + return NewErrorInternalIterator( Status::Corruption("CuckooTableReader status is not okay."), arena); } CuckooTableIterator* iter; diff --git a/table/cuckoo_table_reader.h b/table/cuckoo_table_reader.h index f1539f09f..b37d46373 100644 --- a/table/cuckoo_table_reader.h +++ b/table/cuckoo_table_reader.h @@ -25,7 +25,6 @@ namespace rocksdb { class Arena; class TableReader; -class InternalIterator; class CuckooTableReader: public TableReader { public: diff --git a/table/format.cc b/table/format.cc index 1f8d7c3a6..a4e448870 100644 --- a/table/format.cc +++ b/table/format.cc @@ -54,6 +54,13 @@ void BlockHandle::EncodeTo(std::string* dst) const { PutVarint64Varint64(dst, offset_, size_); } +void BlockHandle::EncodeSizeTo(std::string* dst) const { + // Sanity check that all fields have been set + assert(offset_ != ~static_cast(0)); + assert(size_ != ~static_cast(0)); + PutVarint64(dst, size_); +} + Status BlockHandle::DecodeFrom(Slice* input) { if (GetVarint64(input, &offset_) && GetVarint64(input, &size_)) { @@ -66,6 +73,18 @@ Status BlockHandle::DecodeFrom(Slice* input) { } } +Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) { + if (GetVarint64(input, 
&size_)) { + offset_ = _offset; + return Status::OK(); + } else { + // reset in case failure after partially decoding + offset_ = 0; + size_ = 0; + return Status::Corruption("bad block handle"); + } +} + // Return a string that contains the copy of handle. std::string BlockHandle::ToString(bool hex) const { std::string handle_str; diff --git a/table/format.h b/table/format.h index fd8459fdc..ebc9c2539 100644 --- a/table/format.h +++ b/table/format.h @@ -54,6 +54,8 @@ class BlockHandle { void EncodeTo(std::string* dst) const; Status DecodeFrom(Slice* input); + Status DecodeSizeFrom(uint64_t offset, Slice* input); + void EncodeSizeTo(std::string* dst) const; // Return a string that contains the copy of handle. std::string ToString(bool hex = true) const; @@ -90,7 +92,7 @@ inline uint32_t GetCompressFormatForVersion( } inline bool BlockBasedTableSupportedVersion(uint32_t version) { - return version <= 3; + return version <= 4; } // Footer encapsulates the fixed information stored at the tail diff --git a/table/index_builder.cc b/table/index_builder.cc index ebabbeb8d..6b8114f3e 100644 --- a/table/index_builder.cc +++ b/table/index_builder.cc @@ -27,23 +27,26 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder( BlockBasedTableOptions::IndexType index_type, const InternalKeyComparator* comparator, const InternalKeySliceTransform* int_key_slice_transform, + const bool use_value_delta_encoding, const BlockBasedTableOptions& table_opt) { IndexBuilder* result = nullptr; switch (index_type) { case BlockBasedTableOptions::kBinarySearch: { - result = new ShortenedIndexBuilder(comparator, - table_opt.index_block_restart_interval, - table_opt.format_version); + result = new ShortenedIndexBuilder( + comparator, table_opt.index_block_restart_interval, + table_opt.format_version, use_value_delta_encoding); } break; case BlockBasedTableOptions::kHashSearch: { result = new HashIndexBuilder(comparator, int_key_slice_transform, table_opt.index_block_restart_interval, - 
table_opt.format_version); + table_opt.format_version, + use_value_delta_encoding); } break; case BlockBasedTableOptions::kTwoLevelIndexSearch: { - result = PartitionedIndexBuilder::CreateIndexBuilder(comparator, table_opt); + result = PartitionedIndexBuilder::CreateIndexBuilder( + comparator, use_value_delta_encoding, table_opt); } break; default: { @@ -56,21 +59,27 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder( PartitionedIndexBuilder* PartitionedIndexBuilder::CreateIndexBuilder( const InternalKeyComparator* comparator, + const bool use_value_delta_encoding, const BlockBasedTableOptions& table_opt) { - return new PartitionedIndexBuilder(comparator, table_opt); + return new PartitionedIndexBuilder(comparator, table_opt, + use_value_delta_encoding); } PartitionedIndexBuilder::PartitionedIndexBuilder( const InternalKeyComparator* comparator, - const BlockBasedTableOptions& table_opt) + const BlockBasedTableOptions& table_opt, + const bool use_value_delta_encoding) : IndexBuilder(comparator), index_block_builder_(table_opt.index_block_restart_interval, - table_opt.format_version), + true /*use_delta_encoding*/, + use_value_delta_encoding), index_block_builder_without_seq_(table_opt.index_block_restart_interval, - table_opt.format_version), + true /*use_delta_encoding*/, + use_value_delta_encoding), sub_index_builder_(nullptr), table_opt_(table_opt), - seperator_is_key_plus_seq_(false) {} + seperator_is_key_plus_seq_(false), + use_value_delta_encoding_(use_value_delta_encoding) {} PartitionedIndexBuilder::~PartitionedIndexBuilder() { delete sub_index_builder_; @@ -80,7 +89,7 @@ void PartitionedIndexBuilder::MakeNewSubIndexBuilder() { assert(sub_index_builder_ == nullptr); sub_index_builder_ = new ShortenedIndexBuilder( comparator_, table_opt_.index_block_restart_interval, - table_opt_.format_version); + table_opt_.format_version, use_value_delta_encoding_); flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( table_opt_.metadata_block_size, 
table_opt_.block_size_deviation, sub_index_builder_->index_block_builder_)); @@ -149,10 +158,18 @@ Status PartitionedIndexBuilder::Finish( Entry& last_entry = entries_.front(); std::string handle_encoding; last_partition_block_handle.EncodeTo(&handle_encoding); - index_block_builder_.Add(last_entry.key, handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64( + &handle_delta_encoding, + last_partition_block_handle.size() - last_encoded_handle_.size()); + last_encoded_handle_ = last_partition_block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_block_builder_.Add(last_entry.key, handle_encoding, + &handle_delta_encoding_slice); if (!seperator_is_key_plus_seq_) { index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key), - handle_encoding); + handle_encoding, + &handle_delta_encoding_slice); } entries_.pop_front(); } @@ -202,9 +219,12 @@ size_t PartitionedIndexBuilder::EstimateTopLevelIndexSize( uint64_t size = it->value->EstimatedSize(); BlockHandle tmp_block_handle(offset, size); tmp_block_handle.EncodeTo(&tmp_handle_encoding); + std::string handle_delta_encoding; + tmp_block_handle.EncodeSizeTo(&handle_delta_encoding); + const Slice handle_delta_encoding_slice(handle_delta_encoding); tmp_builder.Add( seperator_is_key_plus_seq_ ? 
it->key : ExtractUserKey(it->key), - tmp_handle_encoding); + tmp_handle_encoding, &handle_delta_encoding_slice); offset += size; } return tmp_builder.CurrentSizeEstimate(); diff --git a/table/index_builder.h b/table/index_builder.h index e8c060716..bc3a2bd54 100644 --- a/table/index_builder.h +++ b/table/index_builder.h @@ -38,6 +38,7 @@ class IndexBuilder { BlockBasedTableOptions::IndexType index_type, const rocksdb::InternalKeyComparator* comparator, const InternalKeySliceTransform* int_key_slice_transform, + const bool use_value_delta_encoding, const BlockBasedTableOptions& table_opt); // Index builder will construct a set of blocks which contain: @@ -117,11 +118,16 @@ class IndexBuilder { class ShortenedIndexBuilder : public IndexBuilder { public: explicit ShortenedIndexBuilder(const InternalKeyComparator* comparator, - int index_block_restart_interval, - uint32_t format_version) + const int index_block_restart_interval, + const uint32_t format_version, + const bool use_value_delta_encoding) : IndexBuilder(comparator), - index_block_builder_(index_block_restart_interval), - index_block_builder_without_seq_(index_block_restart_interval) { + index_block_builder_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_block_builder_without_seq_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding) { // Making the default true will disable the feature for old versions seperator_is_key_plus_seq_ = (format_version <= 2); } @@ -145,10 +151,17 @@ class ShortenedIndexBuilder : public IndexBuilder { std::string handle_encoding; block_handle.EncodeTo(&handle_encoding); - index_block_builder_.Add(sep, handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64(&handle_delta_encoding, + block_handle.size() - last_encoded_handle_.size()); + assert(handle_delta_encoding.size() != 0); + last_encoded_handle_ = block_handle; + const Slice 
handle_delta_encoding_slice(handle_delta_encoding); + index_block_builder_.Add(sep, handle_encoding, + &handle_delta_encoding_slice); if (!seperator_is_key_plus_seq_) { - index_block_builder_without_seq_.Add(ExtractUserKey(sep), - handle_encoding); + index_block_builder_without_seq_.Add(ExtractUserKey(sep), handle_encoding, + &handle_delta_encoding_slice); } } @@ -183,6 +196,7 @@ class ShortenedIndexBuilder : public IndexBuilder { BlockBuilder index_block_builder_; BlockBuilder index_block_builder_without_seq_; bool seperator_is_key_plus_seq_; + BlockHandle last_encoded_handle_; }; // HashIndexBuilder contains a binary-searchable primary index and the @@ -217,10 +231,10 @@ class HashIndexBuilder : public IndexBuilder { explicit HashIndexBuilder(const InternalKeyComparator* comparator, const SliceTransform* hash_key_extractor, int index_block_restart_interval, - int format_version) + int format_version, bool use_value_delta_encoding) : IndexBuilder(comparator), primary_index_builder_(comparator, index_block_restart_interval, - format_version), + format_version, use_value_delta_encoding), hash_key_extractor_(hash_key_extractor) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, @@ -323,10 +337,12 @@ class PartitionedIndexBuilder : public IndexBuilder { public: static PartitionedIndexBuilder* CreateIndexBuilder( const rocksdb::InternalKeyComparator* comparator, + const bool use_value_delta_encoding, const BlockBasedTableOptions& table_opt); explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator, - const BlockBasedTableOptions& table_opt); + const BlockBasedTableOptions& table_opt, + const bool use_value_delta_encoding); virtual ~PartitionedIndexBuilder(); @@ -361,6 +377,8 @@ class PartitionedIndexBuilder : public IndexBuilder { return seperator_is_key_plus_seq_; } + bool get_use_value_delta_encoding() { return use_value_delta_encoding_; } + private: void MakeNewSubIndexBuilder(); @@ -380,10 +398,12 @@ class PartitionedIndexBuilder 
: public IndexBuilder { bool finishing_indexes = false; const BlockBasedTableOptions& table_opt_; bool seperator_is_key_plus_seq_; + bool use_value_delta_encoding_; // true if an external entity (such as filter partition builder) request // cutting the next partition bool partition_cut_requested_ = true; // true if it should cut the next filter partition block bool cut_filter_block = false; + BlockHandle last_encoded_handle_; }; } // namespace rocksdb diff --git a/table/internal_iterator.h b/table/internal_iterator.h index 2fdb14c7f..a173d6069 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -10,15 +10,17 @@ #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" #include "rocksdb/status.h" +#include "table/format.h" namespace rocksdb { class PinnedIteratorsManager; -class InternalIterator : public Cleanable { +template +class InternalIteratorBase : public Cleanable { public: - InternalIterator() {} - virtual ~InternalIterator() {} + InternalIteratorBase() {} + virtual ~InternalIteratorBase() {} // An iterator is either positioned at a key/value pair, or // not valid. This method returns true iff the iterator is valid. @@ -66,7 +68,7 @@ class InternalIterator : public Cleanable { // the returned slice is valid only until the next modification of // the iterator. // REQUIRES: Valid() - virtual Slice value() const = 0; + virtual TValue value() const = 0; // If an error has occurred, return it. Else return an ok status. // If non-blocking IO is requested and this operation cannot be @@ -117,14 +119,24 @@ class InternalIterator : public Cleanable { private: // No copying allowed - InternalIterator(const InternalIterator&) = delete; - InternalIterator& operator=(const InternalIterator&) = delete; + InternalIteratorBase(const InternalIteratorBase&) = delete; + InternalIteratorBase& operator=(const InternalIteratorBase&) = delete; }; +using InternalIterator = InternalIteratorBase; + // Return an empty iterator (yields nothing). 
-extern InternalIterator* NewEmptyInternalIterator(); +template +extern InternalIteratorBase* NewEmptyInternalIterator(); // Return an empty iterator with the specified status. -extern InternalIterator* NewErrorInternalIterator(const Status& status); +template +extern InternalIteratorBase* NewErrorInternalIterator( + const Status& status); + +// Return an empty iterator with the specified status, allocated arena. +template +extern InternalIteratorBase* NewErrorInternalIterator( + const Status& status, Arena* arena); } // namespace rocksdb diff --git a/table/iterator.cc b/table/iterator.cc index 0411b374a..97c47fb28 100644 --- a/table/iterator.cc +++ b/table/iterator.cc @@ -131,7 +131,8 @@ class EmptyIterator : public Iterator { Status status_; }; -class EmptyInternalIterator : public InternalIterator { +template +class EmptyInternalIterator : public InternalIteratorBase { public: explicit EmptyInternalIterator(const Status& s) : status_(s) {} virtual bool Valid() const override { return false; } @@ -145,9 +146,9 @@ class EmptyInternalIterator : public InternalIterator { assert(false); return Slice(); } - Slice value() const override { + TValue value() const override { assert(false); - return Slice(); + return TValue(); } virtual Status status() const override { return status_; } @@ -164,30 +165,48 @@ Iterator* NewErrorIterator(const Status& status) { return new EmptyIterator(status); } -InternalIterator* NewEmptyInternalIterator() { - return new EmptyInternalIterator(Status::OK()); +template +InternalIteratorBase* NewErrorInternalIterator(const Status& status) { + return new EmptyInternalIterator(status); } - -InternalIterator* NewEmptyInternalIterator(Arena* arena) { +template InternalIteratorBase* NewErrorInternalIterator( + const Status& status); +template InternalIteratorBase* NewErrorInternalIterator( + const Status& status); + +template +InternalIteratorBase* NewErrorInternalIterator(const Status& status, + Arena* arena) { if (arena == nullptr) { - return 
NewEmptyInternalIterator(); + return NewErrorInternalIterator(status); } else { auto mem = arena->AllocateAligned(sizeof(EmptyIterator)); - return new (mem) EmptyInternalIterator(Status::OK()); + return new (mem) EmptyInternalIterator(status); } } - -InternalIterator* NewErrorInternalIterator(const Status& status) { - return new EmptyInternalIterator(status); +template InternalIteratorBase* NewErrorInternalIterator( + const Status& status, Arena* arena); +template InternalIteratorBase* NewErrorInternalIterator( + const Status& status, Arena* arena); + +template +InternalIteratorBase* NewEmptyInternalIterator() { + return new EmptyInternalIterator(Status::OK()); } +template InternalIteratorBase* NewEmptyInternalIterator(); +template InternalIteratorBase* NewEmptyInternalIterator(); -InternalIterator* NewErrorInternalIterator(const Status& status, Arena* arena) { +template +InternalIteratorBase* NewEmptyInternalIterator(Arena* arena) { if (arena == nullptr) { - return NewErrorInternalIterator(status); + return NewEmptyInternalIterator(); } else { auto mem = arena->AllocateAligned(sizeof(EmptyIterator)); - return new (mem) EmptyInternalIterator(status); + return new (mem) EmptyInternalIterator(Status::OK()); } } +template InternalIteratorBase* NewEmptyInternalIterator( + Arena* arena); +template InternalIteratorBase* NewEmptyInternalIterator(Arena* arena); } // namespace rocksdb diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index 5ddea2470..5941b846a 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -19,19 +19,21 @@ namespace rocksdb { // the valid() and key() results for an underlying iterator. // This can help avoid virtual function calls and also gives better // cache locality. 
-class IteratorWrapper { +template +class IteratorWrapperBase { public: - IteratorWrapper() : iter_(nullptr), valid_(false) {} - explicit IteratorWrapper(InternalIterator* _iter) : iter_(nullptr) { + IteratorWrapperBase() : iter_(nullptr), valid_(false) {} + explicit IteratorWrapperBase(InternalIteratorBase* _iter) + : iter_(nullptr) { Set(_iter); } - ~IteratorWrapper() {} - InternalIterator* iter() const { return iter_; } + ~IteratorWrapperBase() {} + InternalIteratorBase* iter() const { return iter_; } // Set the underlying Iterator to _iter and return // previous underlying Iterator. - InternalIterator* Set(InternalIterator* _iter) { - InternalIterator* old_iter = iter_; + InternalIteratorBase* Set(InternalIteratorBase* _iter) { + InternalIteratorBase* old_iter = iter_; iter_ = _iter; if (iter_ == nullptr) { @@ -47,7 +49,7 @@ class IteratorWrapper { if (!is_arena_mode) { delete iter_; } else { - iter_->~InternalIterator(); + iter_->~InternalIteratorBase(); } } } @@ -55,7 +57,10 @@ class IteratorWrapper { // Iterator interface methods bool Valid() const { return valid_; } Slice key() const { assert(Valid()); return key_; } - Slice value() const { assert(Valid()); return iter_->value(); } + TValue value() const { + assert(Valid()); + return iter_->value(); + } // Methods below require iter() != nullptr Status status() const { assert(iter_); return iter_->status(); } void Next() { assert(iter_); iter_->Next(); Update(); } @@ -91,17 +96,16 @@ class IteratorWrapper { } } - InternalIterator* iter_; + InternalIteratorBase* iter_; bool valid_; Slice key_; }; +using IteratorWrapper = IteratorWrapperBase; + class Arena; // Return an empty iterator (yields nothing) allocated from arena. -extern InternalIterator* NewEmptyInternalIterator(Arena* arena); - -// Return an empty iterator with the specified status, allocated arena. 
-extern InternalIterator* NewErrorInternalIterator(const Status& status, - Arena* arena); +template +extern InternalIteratorBase* NewEmptyInternalIterator(Arena* arena); } // namespace rocksdb diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index d0bd0f836..744de37da 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -387,7 +387,7 @@ InternalIterator* NewMergingIterator(const InternalKeyComparator* cmp, Arena* arena, bool prefix_seek_mode) { assert(n >= 0); if (n == 0) { - return NewEmptyInternalIterator(arena); + return NewEmptyInternalIterator(arena); } else if (n == 1) { return list[0]; } else { diff --git a/table/merging_iterator.h b/table/merging_iterator.h index 04fcf421d..21ff79bf6 100644 --- a/table/merging_iterator.h +++ b/table/merging_iterator.h @@ -15,9 +15,11 @@ namespace rocksdb { class Comparator; -class InternalIterator; class Env; class Arena; +template +class InternalIteratorBase; +using InternalIterator = InternalIteratorBase; // Return an iterator that provided the union of the data in // children[0,n-1]. 
Takes ownership of the child iterators and diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index ba25c58c8..256730bfa 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -76,6 +76,8 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { Add(TablePropertiesNames::kTopLevelIndexSize, props.top_level_index_size); } Add(TablePropertiesNames::kIndexKeyIsUserKey, props.index_key_is_user_key); + Add(TablePropertiesNames::kIndexValueIsDeltaEncoded, + props.index_value_is_delta_encoded); Add(TablePropertiesNames::kNumEntries, props.num_entries); Add(TablePropertiesNames::kNumRangeDeletions, props.num_range_deletions); Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks); @@ -218,6 +220,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, &new_table_properties->top_level_index_size}, {TablePropertiesNames::kIndexKeyIsUserKey, &new_table_properties->index_key_is_user_key}, + {TablePropertiesNames::kIndexValueIsDeltaEncoded, + &new_table_properties->index_value_is_delta_encoded}, {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size}, {TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size}, {TablePropertiesNames::kRawValueSize, diff --git a/table/meta_blocks.h b/table/meta_blocks.h index 4fceb1a63..a18c8edc4 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -27,7 +27,6 @@ class Footer; class Logger; class RandomAccessFile; struct TableProperties; -class InternalIterator; class MetaIndexBuilder { public: diff --git a/table/partitioned_filter_block.cc b/table/partitioned_filter_block.cc index ec0ac7bed..6084133b7 100644 --- a/table/partitioned_filter_block.cc +++ b/table/partitioned_filter_block.cc @@ -26,12 +26,17 @@ namespace rocksdb { PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( const SliceTransform* prefix_extractor, bool whole_key_filtering, FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, + const 
bool use_value_delta_encoding, PartitionedIndexBuilder* const p_index_builder, const uint32_t partition_size) : FullFilterBlockBuilder(prefix_extractor, whole_key_filtering, filter_bits_builder), - index_on_filter_block_builder_(index_block_restart_interval), - index_on_filter_block_builder_without_seq_(index_block_restart_interval), + index_on_filter_block_builder_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_on_filter_block_builder_without_seq_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), p_index_builder_(p_index_builder), filters_in_partition_(0), num_added_(0) { @@ -73,10 +78,15 @@ Slice PartitionedFilterBlockBuilder::Finish( FilterEntry& last_entry = filters.front(); std::string handle_encoding; last_partition_block_handle.EncodeTo(&handle_encoding); - index_on_filter_block_builder_.Add(last_entry.key, handle_encoding); + std::string handle_delta_encoding; + last_partition_block_handle.EncodeSizeTo(&handle_delta_encoding); + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_on_filter_block_builder_.Add(last_entry.key, handle_encoding, + &handle_delta_encoding_slice); if (!p_index_builder_->seperator_is_key_plus_seq()) { index_on_filter_block_builder_without_seq_.Add( - ExtractUserKey(last_entry.key), handle_encoding); + ExtractUserKey(last_entry.key), handle_encoding, + &handle_delta_encoding_slice); } filters.pop_front(); } else { @@ -109,12 +119,14 @@ PartitionedFilterBlockReader::PartitionedFilterBlockReader( const SliceTransform* prefix_extractor, bool _whole_key_filtering, BlockContents&& contents, FilterBitsReader* /*filter_bits_reader*/, Statistics* stats, const InternalKeyComparator comparator, - const BlockBasedTable* table, const bool index_key_includes_seq) + const BlockBasedTable* table, const bool index_key_includes_seq, + const bool index_value_is_full) : FilterBlockReader(contents.data.size(), stats, _whole_key_filtering), 
prefix_extractor_(prefix_extractor), comparator_(comparator), table_(table), - index_key_includes_seq_(index_key_includes_seq) { + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { idx_on_fltr_blk_.reset(new Block(std::move(contents), kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */, stats)); @@ -134,15 +146,10 @@ PartitionedFilterBlockReader::~PartitionedFilterBlockReader() { Statistics* kNullStats = nullptr; idx_on_fltr_blk_->NewIterator( &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); biter.SeekToFirst(); for (; biter.Valid(); biter.Next()) { - auto input = biter.value(); - auto s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - continue; - } + handle = biter.value(); auto key = BlockBasedTable::GetCacheKey(table_->rep_->cache_key_prefix, table_->rep_->cache_key_prefix_size, handle, cache_key); @@ -168,7 +175,7 @@ bool PartitionedFilterBlockReader::KeyMayMatch( } bool cached = false; auto filter_partition = - GetFilterPartition(nullptr /* prefetch_buffer */, &filter_handle, no_io, + GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io, &cached, prefix_extractor); if (UNLIKELY(!filter_partition.value)) { return true; @@ -207,7 +214,7 @@ bool PartitionedFilterBlockReader::PrefixMayMatch( } bool cached = false; auto filter_partition = - GetFilterPartition(nullptr /* prefetch_buffer */, &filter_handle, no_io, + GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io, &cached, prefix_extractor); if (UNLIKELY(!filter_partition.value)) { return true; @@ -225,29 +232,26 @@ bool PartitionedFilterBlockReader::PrefixMayMatch( return res; } -Slice PartitionedFilterBlockReader::GetFilterPartitionHandle( +BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( const Slice& entry) { IndexBlockIter iter; Statistics* kNullStats = nullptr; 
idx_on_fltr_blk_->NewIterator( &comparator_, comparator_.user_comparator(), &iter, kNullStats, true, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); iter.Seek(entry); if (UNLIKELY(!iter.Valid())) { - return Slice(); + return BlockHandle(0, 0); } assert(iter.Valid()); - Slice handle_value = iter.value(); - return handle_value; + BlockHandle fltr_blk_handle = iter.value(); + return fltr_blk_handle; } BlockBasedTable::CachableEntry PartitionedFilterBlockReader::GetFilterPartition( - FilePrefetchBuffer* prefetch_buffer, Slice* handle_value, const bool no_io, - bool* cached, const SliceTransform* prefix_extractor) { - BlockHandle fltr_blk_handle; - auto s = fltr_blk_handle.DecodeFrom(handle_value); - assert(s.ok()); + FilePrefetchBuffer* prefetch_buffer, BlockHandle& fltr_blk_handle, + const bool no_io, bool* cached, const SliceTransform* prefix_extractor) { const bool is_a_filter_partition = true; auto block_cache = table_->rep_->table_options.block_cache.get(); if (LIKELY(block_cache != nullptr)) { @@ -299,39 +303,25 @@ void PartitionedFilterBlockReader::CacheDependencies( // Before read partitions, prefetch them to avoid lots of IOs auto rep = table_->rep_; IndexBlockIter biter; - BlockHandle handle; Statistics* kNullStats = nullptr; idx_on_fltr_blk_->NewIterator( &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); // Index partitions are assumed to be consecuitive. Prefetch them all. 
// Read the first block offset biter.SeekToFirst(); - Slice input = biter.value(); - Status s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Could not read first index partition"); - return; - } + BlockHandle handle = biter.value(); uint64_t prefetch_off = handle.offset(); // Read the last block's offset biter.SeekToLast(); - input = biter.value(); - s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Could not read last index partition"); - return; - } + handle = biter.value(); uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; auto& file = table_->rep_->file; prefetch_buffer.reset(new FilePrefetchBuffer()); + Status s; s = prefetch_buffer->Prefetch(file.get(), prefetch_off, static_cast(prefetch_len)); @@ -339,14 +329,7 @@ void PartitionedFilterBlockReader::CacheDependencies( biter.SeekToFirst(); Cache* block_cache = rep->table_options.block_cache.get(); for (; biter.Valid(); biter.Next()) { - input = biter.value(); - s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, "Could not read index partition"); - continue; - } - + handle = biter.value(); const bool no_io = true; const bool is_a_filter_partition = true; auto filter = table_->GetFilter( diff --git a/table/partitioned_filter_block.h b/table/partitioned_filter_block.h index 86ec038a9..f6241749d 100644 --- a/table/partitioned_filter_block.h +++ b/table/partitioned_filter_block.h @@ -26,6 +26,7 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { explicit PartitionedFilterBlockBuilder( const SliceTransform* prefix_extractor, bool whole_key_filtering, FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, + const bool use_value_delta_encoding, PartitionedIndexBuilder* const p_index_builder, const 
uint32_t partition_size); @@ -74,7 +75,8 @@ class PartitionedFilterBlockReader : public FilterBlockReader, const SliceTransform* prefix_extractor, bool whole_key_filtering, BlockContents&& contents, FilterBitsReader* filter_bits_reader, Statistics* stats, const InternalKeyComparator comparator, - const BlockBasedTable* table, const bool index_key_includes_seq); + const BlockBasedTable* table, const bool index_key_includes_seq, + const bool index_value_is_full); virtual ~PartitionedFilterBlockReader(); virtual bool IsBlockBased() override { return false; } @@ -89,10 +91,11 @@ class PartitionedFilterBlockReader : public FilterBlockReader, virtual size_t ApproximateMemoryUsage() const override; private: - Slice GetFilterPartitionHandle(const Slice& entry); + BlockHandle GetFilterPartitionHandle(const Slice& entry); BlockBasedTable::CachableEntry GetFilterPartition( - FilePrefetchBuffer* prefetch_buffer, Slice* handle, const bool no_io, - bool* cached, const SliceTransform* prefix_extractor = nullptr); + FilePrefetchBuffer* prefetch_buffer, BlockHandle& handle, + const bool no_io, bool* cached, + const SliceTransform* prefix_extractor = nullptr); virtual void CacheDependencies( bool bin, const SliceTransform* prefix_extractor) override; @@ -101,6 +104,7 @@ class PartitionedFilterBlockReader : public FilterBlockReader, const InternalKeyComparator comparator_; const BlockBasedTable* table_; const bool index_key_includes_seq_; + const bool index_value_is_full_; std::unordered_map> filter_map_; diff --git a/table/partitioned_filter_block_test.cc b/table/partitioned_filter_block_test.cc index feb0c99c6..150eac6a8 100644 --- a/table/partitioned_filter_block_test.cc +++ b/table/partitioned_filter_block_test.cc @@ -100,7 +100,9 @@ class PartitionedFilterBlockTest : public testing::Test { } PartitionedIndexBuilder* NewIndexBuilder() { - return PartitionedIndexBuilder::CreateIndexBuilder(&icomp, table_options_); + const bool kValueDeltaEncoded = true; + return 
PartitionedIndexBuilder::CreateIndexBuilder( + &icomp, !kValueDeltaEncoded, table_options_); } PartitionedFilterBlockBuilder* NewBuilder( @@ -113,11 +115,12 @@ class PartitionedFilterBlockTest : public testing::Test { 99) / 100); partition_size = std::max(partition_size, static_cast(1)); + const bool kValueDeltaEncoded = true; return new PartitionedFilterBlockBuilder( prefix_extractor, table_options_.whole_key_filtering, table_options_.filter_policy->GetFilterBitsBuilder(), - table_options_.index_block_restart_interval, p_index_builder, - partition_size); + table_options_.index_block_restart_interval, !kValueDeltaEncoded, + p_index_builder, partition_size); } std::unique_ptr table; @@ -143,7 +146,8 @@ class PartitionedFilterBlockTest : public testing::Test { !kSkipFilters, !kImmortal))); auto reader = new PartitionedFilterBlockReader( prefix_extractor, true, BlockContents(slice, false, kNoCompression), - nullptr, nullptr, icomp, table.get(), pib->seperator_is_key_plus_seq()); + nullptr, nullptr, icomp, table.get(), pib->seperator_is_key_plus_seq(), + !pib->get_use_value_delta_encoding()); return reader; } diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index 8dd774e4a..df08a98fa 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -38,7 +38,6 @@ class TableReader; class InternalKeyComparator; class PlainTableKeyDecoder; class GetContext; -class InternalIterator; using std::unique_ptr; using std::unordered_map; diff --git a/table/table_properties.cc b/table/table_properties.cc index 9c1c4bd8e..207a64191 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -94,8 +94,9 @@ std::string TableProperties::ToString( AppendProperty(result, "data block size", data_size, prop_delim, kv_delim); char index_block_size_str[80]; snprintf(index_block_size_str, sizeof(index_block_size_str), - "index block size (user-key? %d)", - static_cast(index_key_is_user_key)); + "index block size (user-key? %d, delta-value? 
%d)", + static_cast(index_key_is_user_key), + static_cast(index_value_is_delta_encoded)); AppendProperty(result, index_block_size_str, index_size, prop_delim, kv_delim); if (index_partitions != 0) { @@ -163,6 +164,7 @@ void TableProperties::Add(const TableProperties& tp) { index_partitions += tp.index_partitions; top_level_index_size += tp.top_level_index_size; index_key_is_user_key += tp.index_key_is_user_key; + index_value_is_delta_encoded += tp.index_value_is_delta_encoded; filter_size += tp.filter_size; raw_key_size += tp.raw_key_size; raw_value_size += tp.raw_value_size; @@ -181,6 +183,8 @@ const std::string TablePropertiesNames::kTopLevelIndexSize = "rocksdb.top-level.index.size"; const std::string TablePropertiesNames::kIndexKeyIsUserKey = "rocksdb.index.key.is.user.key"; +const std::string TablePropertiesNames::kIndexValueIsDeltaEncoded = + "rocksdb.index.value.is.delta.encoded"; const std::string TablePropertiesNames::kFilterSize = "rocksdb.filter.size"; const std::string TablePropertiesNames::kRawKeySize = diff --git a/table/table_properties_internal.h b/table/table_properties_internal.h index b4e95750b..888b43d24 100644 --- a/table/table_properties_internal.h +++ b/table/table_properties_internal.h @@ -10,7 +10,6 @@ namespace rocksdb { -class InternalIterator; class BlockHandle; // Seek to the properties block. diff --git a/table/table_reader.h b/table/table_reader.h index b51b44b67..505b5ba1f 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -21,7 +21,6 @@ class Arena; struct ReadOptions; struct TableProperties; class GetContext; -class InternalIterator; // A Table is a sorted map from strings to strings. Tables are // immutable and persistent. 
A Table may be safely accessed from diff --git a/table/table_test.cc b/table/table_test.cc index 9ce98ab47..42220ad1e 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -2988,9 +2988,13 @@ TEST_F(HarnessTest, FooterTests) { class IndexBlockRestartIntervalTest : public TableTest, - public ::testing::WithParamInterface { + public ::testing::WithParamInterface> { public: - static std::vector GetRestartValues() { return {-1, 0, 1, 8, 16, 32}; } + static std::vector> GetRestartValues() { + return {{-1, false}, {0, false}, {1, false}, {8, false}, + {16, false}, {32, false}, {-1, true}, {0, true}, + {1, true}, {8, true}, {16, true}, {32, true}}; + } }; INSTANTIATE_TEST_CASE_P( @@ -3002,12 +3006,16 @@ TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) { const int kKeySize = 100; const int kValSize = 500; - int index_block_restart_interval = GetParam(); + const int index_block_restart_interval = std::get<0>(GetParam()); + const bool value_delta_encoding = std::get<1>(GetParam()); Options options; BlockBasedTableOptions table_options; table_options.block_size = 64; // small block size to get big index block table_options.index_block_restart_interval = index_block_restart_interval; + if (value_delta_encoding) { + table_options.format_version = 4; + } options.table_factory.reset(new BlockBasedTableFactory(table_options)); TableConstructor c(BytewiseComparator()); diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index 09e0e1ef1..58ab61c69 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -19,12 +19,13 @@ namespace rocksdb { namespace { -class TwoLevelIterator : public InternalIterator { +class TwoLevelIndexIterator : public InternalIteratorBase { public: - explicit TwoLevelIterator(TwoLevelIteratorState* state, - InternalIterator* first_level_iter); + explicit TwoLevelIndexIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter); - virtual ~TwoLevelIterator() { + virtual 
~TwoLevelIndexIterator() { first_level_iter_.DeleteIter(false /* is_arena_mode */); second_level_iter_.DeleteIter(false /* is_arena_mode */); delete state_; @@ -42,7 +43,7 @@ class TwoLevelIterator : public InternalIterator { assert(Valid()); return second_level_iter_.key(); } - virtual Slice value() const override { + virtual BlockHandle value() const override { assert(Valid()); return second_level_iter_.value(); } @@ -68,23 +69,24 @@ class TwoLevelIterator : public InternalIterator { } void SkipEmptyDataBlocksForward(); void SkipEmptyDataBlocksBackward(); - void SetSecondLevelIterator(InternalIterator* iter); + void SetSecondLevelIterator(InternalIteratorBase* iter); void InitDataBlock(); TwoLevelIteratorState* state_; - IteratorWrapper first_level_iter_; - IteratorWrapper second_level_iter_; // May be nullptr + IteratorWrapperBase first_level_iter_; + IteratorWrapperBase second_level_iter_; // May be nullptr Status status_; // If second_level_iter is non-nullptr, then "data_block_handle_" holds the // "index_value" passed to block_function_ to create the second_level_iter. 
- std::string data_block_handle_; + BlockHandle data_block_handle_; }; -TwoLevelIterator::TwoLevelIterator(TwoLevelIteratorState* state, - InternalIterator* first_level_iter) +TwoLevelIndexIterator::TwoLevelIndexIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter) : state_(state), first_level_iter_(first_level_iter) {} -void TwoLevelIterator::Seek(const Slice& target) { +void TwoLevelIndexIterator::Seek(const Slice& target) { first_level_iter_.Seek(target); InitDataBlock(); @@ -94,7 +96,7 @@ void TwoLevelIterator::Seek(const Slice& target) { SkipEmptyDataBlocksForward(); } -void TwoLevelIterator::SeekForPrev(const Slice& target) { +void TwoLevelIndexIterator::SeekForPrev(const Slice& target) { first_level_iter_.Seek(target); InitDataBlock(); if (second_level_iter_.iter() != nullptr) { @@ -112,7 +114,7 @@ void TwoLevelIterator::SeekForPrev(const Slice& target) { } } -void TwoLevelIterator::SeekToFirst() { +void TwoLevelIndexIterator::SeekToFirst() { first_level_iter_.SeekToFirst(); InitDataBlock(); if (second_level_iter_.iter() != nullptr) { @@ -121,7 +123,7 @@ void TwoLevelIterator::SeekToFirst() { SkipEmptyDataBlocksForward(); } -void TwoLevelIterator::SeekToLast() { +void TwoLevelIndexIterator::SeekToLast() { first_level_iter_.SeekToLast(); InitDataBlock(); if (second_level_iter_.iter() != nullptr) { @@ -130,19 +132,19 @@ void TwoLevelIterator::SeekToLast() { SkipEmptyDataBlocksBackward(); } -void TwoLevelIterator::Next() { +void TwoLevelIndexIterator::Next() { assert(Valid()); second_level_iter_.Next(); SkipEmptyDataBlocksForward(); } -void TwoLevelIterator::Prev() { +void TwoLevelIndexIterator::Prev() { assert(Valid()); second_level_iter_.Prev(); SkipEmptyDataBlocksBackward(); } -void TwoLevelIterator::SkipEmptyDataBlocksForward() { +void TwoLevelIndexIterator::SkipEmptyDataBlocksForward() { while (second_level_iter_.iter() == nullptr || (!second_level_iter_.Valid() && second_level_iter_.status().ok())) { // Move to next block @@ 
-158,7 +160,7 @@ void TwoLevelIterator::SkipEmptyDataBlocksForward() { } } -void TwoLevelIterator::SkipEmptyDataBlocksBackward() { +void TwoLevelIndexIterator::SkipEmptyDataBlocksBackward() { while (second_level_iter_.iter() == nullptr || (!second_level_iter_.Valid() && second_level_iter_.status().ok())) { // Move to next block @@ -174,24 +176,26 @@ void TwoLevelIterator::SkipEmptyDataBlocksBackward() { } } -void TwoLevelIterator::SetSecondLevelIterator(InternalIterator* iter) { - InternalIterator* old_iter = second_level_iter_.Set(iter); +void TwoLevelIndexIterator::SetSecondLevelIterator( + InternalIteratorBase* iter) { + InternalIteratorBase* old_iter = second_level_iter_.Set(iter); delete old_iter; } -void TwoLevelIterator::InitDataBlock() { +void TwoLevelIndexIterator::InitDataBlock() { if (!first_level_iter_.Valid()) { SetSecondLevelIterator(nullptr); } else { - Slice handle = first_level_iter_.value(); + BlockHandle handle = first_level_iter_.value(); if (second_level_iter_.iter() != nullptr && !second_level_iter_.status().IsIncomplete() && - handle.compare(data_block_handle_) == 0) { + handle.offset() == data_block_handle_.offset()) { // second_level_iter is already constructed with this iterator, so // no need to change anything } else { - InternalIterator* iter = state_->NewSecondaryIterator(handle); - data_block_handle_.assign(handle.data(), handle.size()); + InternalIteratorBase* iter = + state_->NewSecondaryIterator(handle); + data_block_handle_ = handle; SetSecondLevelIterator(iter); } } @@ -199,8 +203,9 @@ void TwoLevelIterator::InitDataBlock() { } // namespace -InternalIterator* NewTwoLevelIterator(TwoLevelIteratorState* state, - InternalIterator* first_level_iter) { - return new TwoLevelIterator(state, first_level_iter); +InternalIteratorBase* NewTwoLevelIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter) { + return new TwoLevelIndexIterator(state, first_level_iter); } } // namespace rocksdb diff --git 
a/table/two_level_iterator.h b/table/two_level_iterator.h index d1c8f91af..55d5c01a4 100644 --- a/table/two_level_iterator.h +++ b/table/two_level_iterator.h @@ -22,7 +22,8 @@ struct TwoLevelIteratorState { TwoLevelIteratorState() {} virtual ~TwoLevelIteratorState() {} - virtual InternalIterator* NewSecondaryIterator(const Slice& handle) = 0; + virtual InternalIteratorBase* NewSecondaryIterator( + const BlockHandle& handle) = 0; }; @@ -36,7 +37,8 @@ struct TwoLevelIteratorState { // Uses a supplied function to convert an index_iter value into // an iterator over the contents of the corresponding block. // Note: this function expects first_level_iter was not created using the arena -extern InternalIterator* NewTwoLevelIterator( - TwoLevelIteratorState* state, InternalIterator* first_level_iter); +extern InternalIteratorBase* NewTwoLevelIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter); } // namespace rocksdb diff --git a/util/coding.h b/util/coding.h index 803e4349d..27a638347 100644 --- a/util/coding.h +++ b/util/coding.h @@ -64,12 +64,27 @@ extern Slice GetLengthPrefixedSlice(const char* data); extern Slice GetSliceUntil(Slice* slice, char delimiter); +// Borrowed from https://github.com/facebook/fbthrift/blob/449a5f77f9f9bae72c9eb5e78093247eef185c04/thrift/lib/cpp/util/VarintUtils-inl.h#L202-L208 +constexpr inline uint64_t i64ToZigzag(const int64_t l) { + return (static_cast(l) << 1) ^ static_cast(l >> 63); +} +inline int64_t zigzagToI64(uint64_t n) { + return (n >> 1) ^ -static_cast(n & 1); +} + // Pointer-based variants of GetVarint... These either store a value // in *v and return a pointer just past the parsed value, or return // nullptr on error. 
These routines only look at bytes in the range // [p..limit-1] extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v); extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v); +inline const char* GetVarsignedint64Ptr(const char* p, const char* limit, + int64_t* value) { + uint64_t u = 0; + const char* ret = GetVarint64Ptr(p, limit, &u); + *value = zigzagToI64(u); + return ret; +} // Returns the length of the varint32 or varint64 encoding of "v" extern int VarintLength(uint64_t v); @@ -249,11 +264,18 @@ inline char* EncodeVarint64(char* dst, uint64_t v) { } inline void PutVarint64(std::string* dst, uint64_t v) { - char buf[10]; + char buf[kMaxVarint64Length]; char* ptr = EncodeVarint64(buf, v); dst->append(buf, static_cast(ptr - buf)); } +inline void PutVarsignedint64(std::string* dst, int64_t v) { + char buf[kMaxVarint64Length]; + // Using Zigzag format to convert signed to unsigned + char* ptr = EncodeVarint64(buf, i64ToZigzag(v)); + dst->append(buf, static_cast(ptr - buf)); +} + inline void PutVarint64Varint64(std::string* dst, uint64_t v1, uint64_t v2) { char buf[20]; char* ptr = EncodeVarint64(buf, v1); diff --git a/util/testutil.cc b/util/testutil.cc index 7625d20ee..6094d7ba0 100644 --- a/util/testutil.cc +++ b/util/testutil.cc @@ -20,7 +20,7 @@ namespace rocksdb { namespace test { const uint32_t kDefaultFormatVersion = BlockBasedTableOptions().format_version; -const uint32_t kLatestFormatVersion = 3u; +const uint32_t kLatestFormatVersion = 4u; Slice RandomString(Random* rnd, int len, std::string* dst) { dst->resize(len); diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h index ea8b1717e..d024cecb2 100644 --- a/utilities/transactions/transaction_test.h +++ b/utilities/transactions/transaction_test.h @@ -406,7 +406,7 @@ class TransactionTestBase : public ::testing::Test { if (empty_wal) { ASSERT_OK(s); } else { - // Test that we can detect the WAL that is produced 
by an incompatbile + // Test that we can detect the WAL that is produced by an incompatible // WritePolicy and fail fast before mis-interpreting the WAL. ASSERT_TRUE(s.IsNotSupported()); return;