From 230bcae7b61c03ded4fd65e9d157c66348440c95 Mon Sep 17 00:00:00 2001 From: Sergei Petrunia Date: Tue, 5 Nov 2019 11:29:31 -0800 Subject: [PATCH] Add a limited support for iteration bounds into BaseDeltaIterator (#5403) Summary: For MDEV-19670: MyRocks: key lookups into deleted data are very slow BaseDeltaIterator remembers iterate_upper_bound and will not let delta_iterator_ walk above the iterate_upper_bound if base_iterator_ is not valid anymore. == Rationale == The most straightforward way would be to make the delta_iterator (which is a rocksdb::WBWIIterator) to support iterator bounds. But checking for bounds has an extra CPU overhead. So we put the check into BaseDeltaIterator, and only make it when base_iterator_ is not valid. (note: We could take it even further, and move the check a few lines down, and only check iterator bounds ourselves if base_iterator_ is not valid AND delta_iterator_ hit a tombstone). Pull Request resolved: https://github.com/facebook/rocksdb/pull/5403 Differential Revision: D15863092 Pulled By: maysamyabandeh fbshipit-source-id: 8da458e7b9af95ff49356666f69664b4a6ccf49b --- db/c_test.c | 1 + .../utilities/write_batch_with_index.h | 3 ++- utilities/transactions/transaction_base.cc | 3 ++- .../write_batch_with_index.cc | 22 +++++++++++++++---- 4 files changed, 23 insertions(+), 6 deletions(-) diff --git a/db/c_test.c b/db/c_test.c index 4b4b165c8..e851aad53 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -1467,6 +1467,7 @@ int main(int argc, char** argv) { CheckCondition(!rocksdb_iter_valid(iter)); rocksdb_iter_destroy(iter); + rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0); } } diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h index 586088d75..26fb79517 100644 --- a/include/rocksdb/utilities/write_batch_with_index.h +++ b/include/rocksdb/utilities/write_batch_with_index.h @@ -165,7 +165,8 @@ class WriteBatchWithIndex : public WriteBatchBase { // the write batch update finishes. The state may recover after Next() is // called. Iterator* NewIteratorWithBase(ColumnFamilyHandle* column_family, - Iterator* base_iterator); + Iterator* base_iterator, + const ReadOptions *opts = nullptr); // default column family Iterator* NewIteratorWithBase(Iterator* base_iterator); diff --git a/utilities/transactions/transaction_base.cc b/utilities/transactions/transaction_base.cc index c10c1795f..04e664f6f 100644 --- a/utilities/transactions/transaction_base.cc +++ b/utilities/transactions/transaction_base.cc @@ -369,7 +369,8 @@ Iterator* TransactionBaseImpl::GetIterator(const ReadOptions& read_options, Iterator* db_iter = db_->NewIterator(read_options, column_family); assert(db_iter); - return write_batch_.NewIteratorWithBase(column_family, db_iter); + return write_batch_.NewIteratorWithBase(column_family, db_iter, + &read_options); } Status TransactionBaseImpl::Put(ColumnFamilyHandle* column_family, diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index 2f51f4f9f..3eb14ebe8 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -33,14 +33,18 @@ namespace rocksdb { class BaseDeltaIterator : public Iterator { public: BaseDeltaIterator(Iterator* base_iterator, WBWIIterator* delta_iterator, - const Comparator* comparator) + const Comparator* comparator, + const ReadOptions* read_options = nullptr) : forward_(true), current_at_base_(true), equal_keys_(false), status_(Status::OK()), base_iterator_(base_iterator), delta_iterator_(delta_iterator), - comparator_(comparator) {} + comparator_(comparator), + iterate_upper_bound_(read_options? read_options->iterate_upper_bound : + nullptr) + {} ~BaseDeltaIterator() override {} @@ -279,6 +283,13 @@ class BaseDeltaIterator : public Iterator { // Finished return; } + if (iterate_upper_bound_) { + if (comparator_->Compare(delta_entry.key, + *iterate_upper_bound_) >= 0) { + // out of upper bound -> finished. + return; + } + } if (delta_entry.type == kDeleteRecord || delta_entry.type == kSingleDeleteRecord) { AdvanceDelta(); @@ -326,6 +337,7 @@ class BaseDeltaIterator : public Iterator { std::unique_ptr base_iterator_; std::unique_ptr delta_iterator_; const Comparator* comparator_; // not owned + const Slice* iterate_upper_bound_; }; typedef SkipList @@ -647,13 +659,15 @@ WBWIIterator* WriteBatchWithIndex::NewIterator( } Iterator* WriteBatchWithIndex::NewIteratorWithBase( - ColumnFamilyHandle* column_family, Iterator* base_iterator) { + ColumnFamilyHandle* column_family, Iterator* base_iterator, + const ReadOptions *read_options) { if (rep->overwrite_key == false) { assert(false); return nullptr; } return new BaseDeltaIterator(base_iterator, NewIterator(column_family), - GetColumnFamilyUserComparator(column_family)); + GetColumnFamilyUserComparator(column_family), + read_options); } Iterator* WriteBatchWithIndex::NewIteratorWithBase(Iterator* base_iterator) {