diff --git a/db/db_iter.cc b/db/db_iter.cc index 8af13b7fa..df7e1b450 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -47,13 +47,14 @@ static void DumpInternalIter(Iterator* iter) { #endif DBIter::DBIter(Env* _env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, const Comparator* cmp, - InternalIterator* iter, SequenceNumber s, bool arena_mode, - uint64_t max_sequential_skip_in_iterations, - ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, - bool allow_blob) - : env_(_env), + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, + const Comparator* cmp, InternalIterator* iter, SequenceNumber s, + bool arena_mode, uint64_t max_sequential_skip_in_iterations, + ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool allow_blob) + : prefix_extractor_(mutable_cf_options.prefix_extractor.get()), + env_(_env), logger_(cf_options.info_log), user_comparator_(cmp), merge_operator_(cf_options.merge_operator), @@ -68,7 +69,9 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options, valid_(false), current_entry_is_merged_(false), is_key_seqnum_zero_(false), - prefix_same_as_start_(read_options.prefix_same_as_start), + prefix_same_as_start_(mutable_cf_options.prefix_extractor + ? read_options.prefix_same_as_start + : false), pin_thru_lifetime_(read_options.pin_data), total_order_seek_(read_options.total_order_seek), allow_blob_(allow_blob), @@ -79,7 +82,6 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options, cfd_(cfd), start_seqnum_(read_options.iter_start_seqnum) { RecordTick(statistics_, NO_ITERATOR_CREATED); - prefix_extractor_ = mutable_cf_options.prefix_extractor.get(); max_skip_ = max_sequential_skip_in_iterations; max_skippable_internal_keys_ = read_options.max_skippable_internal_keys; if (pin_thru_lifetime_) { @@ -152,8 +154,13 @@ void DBIter::Next() { local_stats_.next_count_++; if (ok && iter_.Valid()) { + Slice prefix; + if (prefix_same_as_start_) { + assert(prefix_extractor_ != nullptr); + prefix = prefix_.GetUserKey(); + } FindNextUserEntry(true /* skipping the current user key */, - prefix_same_as_start_); + prefix_same_as_start_ ? &prefix : nullptr); } else { is_key_seqnum_zero_ = false; valid_ = false; @@ -164,7 +171,7 @@ void DBIter::Next() { } } -// PRE: saved_key_ has the current user key if skipping +// PRE: saved_key_ has the current user key if skipping_saved_key // POST: saved_key_ should have the next user key if valid_, // if the current entry is a result of merge // current_entry_is_merged_ => true @@ -174,17 +181,17 @@ void DBIter::Next() { // a delete marker or a sequence number higher than sequence_ // saved_key_ MUST have a proper user_key before calling this function // -// The prefix_check parameter controls whether we check the iterated -// keys against the prefix of the seeked key. Set to false when -// performing a seek without a key (e.g. SeekToFirst). Set to -// prefix_same_as_start_ for other iterations. -bool DBIter::FindNextUserEntry(bool skipping, bool prefix_check) { +// The prefix parameter, if not null, indicates that we need to iterator +// within the prefix, and the iterator needs to be made invalid, if no +// more entry for the prefix can be found. +bool DBIter::FindNextUserEntry(bool skipping_saved_key, const Slice* prefix) { PERF_TIMER_GUARD(find_next_user_entry_time); - return FindNextUserEntryInternal(skipping, prefix_check); + return FindNextUserEntryInternal(skipping_saved_key, prefix); } // Actual implementation of DBIter::FindNextUserEntry() -bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { +bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, + const Slice* prefix) { // Loop until we hit an acceptable entry to yield assert(iter_.Valid()); assert(status_.ok()); @@ -193,9 +200,10 @@ bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { // How many times in a row we have skipped an entry with user key less than // or equal to saved_key_. We could skip these entries either because - // sequence numbers were too high or because skipping = true. + // sequence numbers were too high or because skipping_saved_key = true. // What saved_key_ contains throughout this method: - // - if skipping : saved_key_ contains the key that we need to skip, + // - if skipping_saved_key : saved_key_ contains the key that we need + // to skip, // and we haven't seen any keys greater than that, // - if num_skipped > 0 : saved_key_ contains the key that we have skipped // num_skipped times, and we haven't seen any keys @@ -228,9 +236,10 @@ bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { break; } - if (prefix_extractor_ && prefix_check && - prefix_extractor_->Transform(ikey_.user_key) - .compare(prefix_start_key_) != 0) { + assert(prefix == nullptr || prefix_extractor_ != nullptr); + if (prefix != nullptr && + prefix_extractor_->Transform(ikey_.user_key).compare(*prefix) != 0) { + assert(prefix_same_as_start_); break; } @@ -243,14 +252,15 @@ bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { // possibly be skipped. This condition can potentially be relaxed to // prev_key.seq <= ikey_.sequence. We are cautious because it will be more // prone to bugs causing the same user key with the same sequence number. - if (!is_prev_key_seqnum_zero && skipping && + if (!is_prev_key_seqnum_zero && skipping_saved_key && user_comparator_.Compare(ikey_.user_key, saved_key_.GetUserKey()) <= 0) { num_skipped++; // skip this entry PERF_COUNTER_ADD(internal_key_skipped_count, 1); } else { - assert(!skipping || user_comparator_.Compare( - ikey_.user_key, saved_key_.GetUserKey()) > 0); + assert(!skipping_saved_key || + user_comparator_.Compare(ikey_.user_key, + saved_key_.GetUserKey()) > 0); num_skipped = 0; reseek_done = false; switch (ikey_.type) { @@ -271,7 +281,7 @@ bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { saved_key_.SetUserKey( ikey_.user_key, !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */); - skipping = true; + skipping_saved_key = true; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); } break; @@ -289,12 +299,12 @@ bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { return true; } else { // this key and all previous versions shouldn't be included, - // skipping + // skipping_saved_key saved_key_.SetUserKey( ikey_.user_key, !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */); - skipping = true; + skipping_saved_key = true; } } else { saved_key_.SetUserKey( @@ -304,7 +314,7 @@ bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { ikey_, RangeDelPositioningMode::kForwardTraversal)) { // Arrange to skip all upcoming entries for this key since // they are hidden by this deletion. - skipping = true; + skipping_saved_key = true; num_skipped = 0; reseek_done = false; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); @@ -335,7 +345,7 @@ bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { ikey_, RangeDelPositioningMode::kForwardTraversal)) { // Arrange to skip all upcoming entries for this key since // they are hidden by this deletion. - skipping = true; + skipping_saved_key = true; num_skipped = 0; reseek_done = false; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); @@ -360,13 +370,13 @@ bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { // to seek to the target sequence number. int cmp = user_comparator_.Compare(ikey_.user_key, saved_key_.GetUserKey()); - if (cmp == 0 || (skipping && cmp <= 0)) { + if (cmp == 0 || (skipping_saved_key && cmp <= 0)) { num_skipped++; } else { saved_key_.SetUserKey( ikey_.user_key, !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); - skipping = false; + skipping_saved_key = false; num_skipped = 0; reseek_done = false; } @@ -386,14 +396,14 @@ bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { num_skipped = 0; reseek_done = true; std::string last_key; - if (skipping) { + if (skipping_saved_key) { // We're looking for the next user-key but all we see are the same // user-key with decreasing sequence numbers. Fast forward to // sequence number 0 and type deletion (the smallest type). AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion)); - // Don't set skipping = false because we may still see more user-keys - // equal to saved_key_. + // Don't set skipping_saved_key = false because we may still see more + // user-keys equal to saved_key_. } else { // We saw multiple entries with this user key and sequence numbers // higher than sequence_. Fast forward to sequence_. @@ -534,8 +544,14 @@ void DBIter::Prev() { } } if (ok) { - PrevInternal(); + Slice prefix; + if (prefix_same_as_start_) { + assert(prefix_extractor_ != nullptr); + prefix = prefix_.GetUserKey(); + } + PrevInternal(prefix_same_as_start_ ? &prefix : nullptr); } + if (statistics_ != nullptr) { local_stats_.prev_count_++; if (valid_) { @@ -613,15 +629,17 @@ bool DBIter::ReverseToBackward() { return FindUserKeyBeforeSavedKey(); } -void DBIter::PrevInternal() { +void DBIter::PrevInternal(const Slice* prefix) { while (iter_.Valid()) { saved_key_.SetUserKey( ExtractUserKey(iter_.key()), !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); - if (prefix_extractor_ && prefix_same_as_start_ && + assert(prefix == nullptr || prefix_extractor_ != nullptr); + if (prefix != nullptr && prefix_extractor_->Transform(saved_key_.GetUserKey()) - .compare(prefix_start_key_) != 0) { + .compare(*prefix) != 0) { + assert(prefix_same_as_start_); // Current key does not have the same prefix as start valid_ = false; return; @@ -1047,120 +1065,152 @@ bool DBIter::IsVisible(SequenceNumber sequence) { } } -void DBIter::Seek(const Slice& target) { - PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); - StopWatch sw(env_, statistics_, DB_SEEK); - status_ = Status::OK(); - ReleaseTempPinnedData(); - ResetInternalKeysSkippedCounter(); +void DBIter::SetSavedKeyToSeekTarget(const Slice& target) { is_key_seqnum_zero_ = false; - SequenceNumber seq = sequence_; saved_key_.Clear(); saved_key_.SetInternalKey(target, seq); -#ifndef ROCKSDB_LITE - if (db_impl_ != nullptr && cfd_ != nullptr) { - db_impl_->TraceIteratorSeek(cfd_->GetID(), target); - } -#endif // ROCKSDB_LITE - if (iterate_lower_bound_ != nullptr && user_comparator_.Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) < 0) { + // Seek key is smaller than the lower bound. saved_key_.Clear(); saved_key_.SetInternalKey(*iterate_lower_bound_, seq); } +} + +void DBIter::SetSavedKeyToSeekForPrevTarget(const Slice& target) { + is_key_seqnum_zero_ = false; + saved_key_.Clear(); + // now saved_key is used to store internal key. + saved_key_.SetInternalKey(target, 0 /* sequence_number */, + kValueTypeForSeekForPrev); + + if (iterate_upper_bound_ != nullptr && + user_comparator_.Compare(saved_key_.GetUserKey(), + *iterate_upper_bound_) >= 0) { + saved_key_.Clear(); + saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber); + } +} + +void DBIter::Seek(const Slice& target) { + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); + StopWatch sw(env_, statistics_, DB_SEEK); + +#ifndef ROCKSDB_LITE + if (db_impl_ != nullptr && cfd_ != nullptr) { + db_impl_->TraceIteratorSeek(cfd_->GetID(), target); + } +#endif // ROCKSDB_LITE + + status_ = Status::OK(); + ReleaseTempPinnedData(); + ResetInternalKeysSkippedCounter(); + // Seek the inner iterator based on the target key. { PERF_TIMER_GUARD(seek_internal_seek_time); + + SetSavedKeyToSeekTarget(target); iter_.Seek(saved_key_.GetInternalKey()); + range_del_agg_.InvalidateRangeDelMapPositions(); + RecordTick(statistics_, NUMBER_DB_SEEK); } - RecordTick(statistics_, NUMBER_DB_SEEK); - if (iter_.Valid()) { - if (prefix_extractor_ && prefix_same_as_start_) { - prefix_start_key_ = prefix_extractor_->Transform(target); - } - direction_ = kForward; - ClearSavedValue(); - FindNextUserEntry(false /* not skipping */, prefix_same_as_start_); - if (!valid_) { - prefix_start_key_.clear(); - } - if (statistics_ != nullptr) { - if (valid_) { - // Decrement since we don't want to count this key as skipped - RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); - RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); - PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); - } + if (!iter_.Valid()) { + valid_ = false; + return; + } + direction_ = kForward; + + // Now the inner iterator is placed to the target position. From there, + // we need to find out the next key that is visible to the user. + // + ClearSavedValue(); + if (prefix_same_as_start_) { + // The case where the iterator needs to be invalidated if it has exausted + // keys within the same prefix of the seek key. + assert(prefix_extractor_ != nullptr); + Slice target_prefix; + target_prefix = prefix_extractor_->Transform(target); + FindNextUserEntry(false /* not skipping saved_key */, + &target_prefix /* prefix */); + if (valid_) { + // Remember the prefix of the seek key for the future Prev() call to + // check. + prefix_.SetUserKey(target_prefix); } } else { - valid_ = false; + FindNextUserEntry(false /* not skipping saved_key */, nullptr); + } + if (!valid_) { + return; } - if (valid_ && prefix_extractor_ && prefix_same_as_start_) { - prefix_start_buf_.SetUserKey(prefix_start_key_); - prefix_start_key_ = prefix_start_buf_.GetUserKey(); + // Updating stats and perf context counters. + if (statistics_ != nullptr) { + // Decrement since we don't want to count this key as skipped + RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); + RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); } + PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); } void DBIter::SeekForPrev(const Slice& target) { PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); StopWatch sw(env_, statistics_, DB_SEEK); + +#ifndef ROCKSDB_LITE + if (db_impl_ != nullptr && cfd_ != nullptr) { + db_impl_->TraceIteratorSeekForPrev(cfd_->GetID(), target); + } +#endif // ROCKSDB_LITE + status_ = Status::OK(); ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); - is_key_seqnum_zero_ = false; - saved_key_.Clear(); - // now saved_key is used to store internal key. - saved_key_.SetInternalKey(target, 0 /* sequence_number */, - kValueTypeForSeekForPrev); - - if (iterate_upper_bound_ != nullptr && - user_comparator_.Compare(saved_key_.GetUserKey(), - *iterate_upper_bound_) >= 0) { - saved_key_.Clear(); - saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber); - } + // Seek the inner iterator based on the target key. { PERF_TIMER_GUARD(seek_internal_seek_time); + SetSavedKeyToSeekForPrevTarget(target); iter_.SeekForPrev(saved_key_.GetInternalKey()); range_del_agg_.InvalidateRangeDelMapPositions(); + RecordTick(statistics_, NUMBER_DB_SEEK); } - -#ifndef ROCKSDB_LITE - if (db_impl_ != nullptr && cfd_ != nullptr) { - db_impl_->TraceIteratorSeekForPrev(cfd_->GetID(), target); + if (!iter_.Valid()) { + valid_ = false; + return; } -#endif // ROCKSDB_LITE + direction_ = kReverse; - RecordTick(statistics_, NUMBER_DB_SEEK); - if (iter_.Valid()) { - if (prefix_extractor_ && prefix_same_as_start_) { - prefix_start_key_ = prefix_extractor_->Transform(target); - } - direction_ = kReverse; - ClearSavedValue(); - PrevInternal(); - if (!valid_) { - prefix_start_key_.clear(); - } - if (statistics_ != nullptr) { - if (valid_) { - RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); - RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); - PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); - } + // Now the inner iterator is placed to the target position. From there, + // we need to find out the first key that is visible to the user in the + // backward direction. + ClearSavedValue(); + if (prefix_same_as_start_) { + // The case where the iterator needs to be invalidated if it has exausted + // keys within the same prefix of the seek key. + assert(prefix_extractor_ != nullptr); + Slice target_prefix; + target_prefix = prefix_extractor_->Transform(target); + PrevInternal(&target_prefix); + if (valid_) { + // Remember the prefix of the seek key for the future Prev() call to + // check. + prefix_.SetUserKey(target_prefix); } } else { - valid_ = false; + PrevInternal(nullptr); } - if (valid_ && prefix_extractor_ && prefix_same_as_start_) { - prefix_start_buf_.SetUserKey(prefix_start_key_); - prefix_start_key_ = prefix_start_buf_.GetUserKey(); + + // Report stats and perf context. + if (statistics_ != nullptr && valid_) { + RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); + RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); + PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); } } @@ -1193,7 +1243,8 @@ void DBIter::SeekToFirst() { saved_key_.SetUserKey( ExtractUserKey(iter_.key()), !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); - FindNextUserEntry(false /* not skipping */, false /* no prefix check */); + FindNextUserEntry(false /* not skipping saved_key */, + nullptr /* no prefix check */); if (statistics_ != nullptr) { if (valid_) { RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); @@ -1204,10 +1255,9 @@ void DBIter::SeekToFirst() { } else { valid_ = false; } - if (valid_ && prefix_extractor_ && prefix_same_as_start_) { - prefix_start_buf_.SetUserKey( - prefix_extractor_->Transform(saved_key_.GetUserKey())); - prefix_start_key_ = prefix_start_buf_.GetUserKey(); + if (valid_ && prefix_same_as_start_) { + assert(prefix_extractor_ != nullptr); + prefix_.SetUserKey(prefix_extractor_->Transform(saved_key_.GetUserKey())); } } @@ -1217,7 +1267,7 @@ void DBIter::SeekToLast() { SeekForPrev(*iterate_upper_bound_); if (Valid() && user_comparator_.Equal(*iterate_upper_bound_, key())) { ReleaseTempPinnedData(); - PrevInternal(); + PrevInternal(nullptr); } return; } @@ -1240,7 +1290,7 @@ void DBIter::SeekToLast() { iter_.SeekToLast(); range_del_agg_.InvalidateRangeDelMapPositions(); } - PrevInternal(); + PrevInternal(nullptr); if (statistics_ != nullptr) { RecordTick(statistics_, NUMBER_DB_SEEK); if (valid_) { @@ -1249,10 +1299,9 @@ void DBIter::SeekToLast() { PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); } } - if (valid_ && prefix_extractor_ && prefix_same_as_start_) { - prefix_start_buf_.SetUserKey( - prefix_extractor_->Transform(saved_key_.GetUserKey())); - prefix_start_key_ = prefix_start_buf_.GetUserKey(); + if (valid_ && prefix_same_as_start_) { + assert(prefix_extractor_ != nullptr); + prefix_.SetUserKey(prefix_extractor_->Transform(saved_key_.GetUserKey())); } } diff --git a/db/db_iter.h b/db/db_iter.h index d1b60ca8f..15ee56d50 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -203,15 +203,28 @@ class DBIter final: public Iterator { // in this case callers would usually stop what they were doing and return. bool ReverseToForward(); bool ReverseToBackward(); + // Set saved_key_ to the seek key to target, with proper sequence number set. + // It might get adjusted if the seek key is smaller than iterator lower bound. + void SetSavedKeyToSeekTarget(const Slice& /*target*/); + // Set saved_key_ to the seek key to target, with proper sequence number set. + // It might get adjusted if the seek key is larger than iterator upper bound. + void SetSavedKeyToSeekForPrevTarget(const Slice& /*target*/); bool FindValueForCurrentKey(); bool FindValueForCurrentKeyUsingSeek(); bool FindUserKeyBeforeSavedKey(); - bool FindNextUserEntry(bool skipping, bool prefix_check); - bool FindNextUserEntryInternal(bool skipping, bool prefix_check); + // If `skipping_saved_key` is true, the function will keep iterating until it + // finds a user key that is larger than `saved_key_`. + // If `prefix` is not null, the iterator needs to stop when all keys for the + // prefix are exhausted and the interator is set to invalid. + bool FindNextUserEntry(bool skipping_saved_key, const Slice* prefix); + // Internal implementation of FindNextUserEntry(). + bool FindNextUserEntryInternal(bool skipping_saved_key, const Slice* prefix); bool ParseKey(ParsedInternalKey* key); bool MergeValuesNewToOld(); - void PrevInternal(); + // If prefix is not null, we need to set the iterator to invalid if no more + // entry can be found within the prefix. + void PrevInternal(const Slice* /*prefix*/); bool TooManyInternalKeysSkipped(bool increment = true); bool IsVisible(SequenceNumber sequence); @@ -273,10 +286,14 @@ class DBIter final: public Iterator { const Slice* iterate_lower_bound_; const Slice* iterate_upper_bound_; - IterKey prefix_start_buf_; + // The prefix of the seek key. It is only used when prefix_same_as_start_ + // is true and prefix extractor is not null. In Next() or Prev(), current keys + // will be checked against this prefix, so that the iterator can be + // invalidated if the keys in this prefix has been exhausted. Set it using + // SetUserKey() and use it using GetUserKey(). + IterKey prefix_; Status status_; - Slice prefix_start_key_; Direction direction_; bool valid_; bool current_entry_is_merged_;