Fix WriteBatchWithIndex's SeekForPrev() (#4559)

Summary: WriteBatchWithIndex's SeekForPrev() has a bug that we internally place the position just before the seek key rather than after. This makes the iterator to miss the result that is the same as the seek key. Fix it by position the iterator equal or smaller. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4559 Differential Revision: D10468534 Pulled By: siying fbshipit-source-id: 2fb371ae809c561b60a1c11cef71e1c66fea1f19
7 years ago · c17383f918
parent da4aa59b4c
commit c17383f918
5 changed files with 83 additions and 22 deletions
--- a/HISTORY.md
+++ b/HISTORY.md
@ -9,6 +9,7 @@
 * Fix in-memory range tombstone truncation to avoid erroneously covering newer keys at a lower level, and include range tombstones in compacted files whose largest key is the range tombstone's start key.
 * Properly set the stop key for a truncated manual CompactRange
 * Fix slow flush/compaction when DB contains many snapshots. The problem became noticeable to us in DBs with 100,000+ snapshots, though it will affect others at different thresholds.
+* Fix the bug that WriteBatchWithIndex's SeekForPrev() doesn't see the entries with the same key.

 ## 5.17.0 (10/05/2018)
 ### Public API Change
--- a/utilities/write_batch_with_index/write_batch_with_index.cc
+++ b/utilities/write_batch_with_index/write_batch_with_index.cc
@ -352,14 +352,16 @@ class WBWIIteratorImpl : public WBWIIterator {
  }

  virtual void SeekToFirst() override {
-    WriteBatchIndexEntry search_entry(WriteBatchIndexEntry::kFlagMin,
-                                      column_family_id_, 0, 0);
+    WriteBatchIndexEntry search_entry(
+        nullptr /* search_key */, column_family_id_,
+        true /* is_forward_direction */, true /* is_seek_to_first */);
    skip_list_iter_.Seek(&search_entry);
  }

  virtual void SeekToLast() override {
-    WriteBatchIndexEntry search_entry(WriteBatchIndexEntry::kFlagMin,
-                                      column_family_id_ + 1, 0, 0);
+    WriteBatchIndexEntry search_entry(
+        nullptr /* search_key */, column_family_id_ + 1,
+        true /* is_forward_direction */, true /* is_seek_to_first */);
    skip_list_iter_.Seek(&search_entry);
    if (!skip_list_iter_.Valid()) {
      skip_list_iter_.SeekToLast();
@ -369,12 +371,16 @@ class WBWIIteratorImpl : public WBWIIterator {
  }

  virtual void Seek(const Slice& key) override {
-    WriteBatchIndexEntry search_entry(&key, column_family_id_);
+    WriteBatchIndexEntry search_entry(&key, column_family_id_,
+                                      true /* is_forward_direction */,
+                                      false /* is_seek_to_first */);
    skip_list_iter_.Seek(&search_entry);
  }

  virtual void SeekForPrev(const Slice& key) override {
-    WriteBatchIndexEntry search_entry(&key, column_family_id_);
+    WriteBatchIndexEntry search_entry(&key, column_family_id_,
+                                      false /* is_forward_direction */,
+                                      false /* is_seek_to_first */);
    skip_list_iter_.SeekForPrev(&search_entry);
  }

--- a/utilities/write_batch_with_index/write_batch_with_index_internal.cc
+++ b/utilities/write_batch_with_index/write_batch_with_index_internal.cc
@ -85,6 +85,20 @@ Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset,
  return Status::OK();
 }

+// If both of `entry1` and `entry2` point to real entry in write batch, we
+// compare the entries as following:
+// 1. first compare the column family, the one with larger CF will be larger;
+// 2. Inside the same CF, we first decode the entry to find the key of the entry
+//    and the entry with larger key will be larger;
+// 3. If two entries are of the same CF and offset, the one with larger offset
+//    will be larger.
+// Some times either `entry1` or `entry2` is dummy entry, which is actually
+// a search key. In this case, in step 2, we don't go ahead and decode the
+// entry but use the value in WriteBatchIndexEntry::search_key.
+// One special case is WriteBatchIndexEntry::key_size is kFlagMinInCf.
+// This indicate that we are going to seek to the first of the column family.
+// Once we see this, this entry will be smaller than all the real entries of
+// the column family.
 int WriteBatchEntryComparator::operator()(
    const WriteBatchIndexEntry* entry1,
    const WriteBatchIndexEntry* entry2) const {
@ -94,9 +108,10 @@ int WriteBatchEntryComparator::operator()(
    return -1;
  }

-  if (entry1->offset == WriteBatchIndexEntry::kFlagMin) {
+  // Deal with special case of seeking to the beginning of a column family
+  if (entry1->is_min_in_cf()) {
    return -1;
-  } else if (entry2->offset == WriteBatchIndexEntry::kFlagMin) {
+  } else if (entry2->is_min_in_cf()) {
    return 1;
  }

--- a/utilities/write_batch_with_index/write_batch_with_index_internal.h
+++ b/utilities/write_batch_with_index/write_batch_with_index_internal.h
@ -31,21 +31,52 @@ struct WriteBatchIndexEntry {
        key_offset(ko),
        key_size(ksz),
        search_key(nullptr) {}
-  WriteBatchIndexEntry(const Slice* sk, uint32_t c)
-      : offset(0),
-        column_family(c),
+  // Create a dummy entry as the search key. This index entry won't be backed
+  // by an entry from the write batch, but a pointer to the search key. Or a
+  // special flag of offset can indicate we are seek to first.
+  // @_search_key: the search key
+  // @_column_family: column family
+  // @is_forward_direction: true for Seek(). False for SeekForPrev()
+  // @is_seek_to_first: true if we seek to the beginning of the column family
+  //                    _search_key should be null in this case.
+  WriteBatchIndexEntry(const Slice* _search_key, uint32_t _column_family,
+                       bool is_forward_direction, bool is_seek_to_first)
+      // For SeekForPrev(), we need to make the dummy entry larger than any
+      // entry who has the same search key. Otherwise, we'll miss those entries.
+      : offset(is_forward_direction ? 0 : port::kMaxSizet),
+        column_family(_column_family),
        key_offset(0),
-        key_size(0),
-        search_key(sk) {}
+        key_size(is_seek_to_first ? kFlagMinInCf : 0),
+        search_key(_search_key) {
+    assert(_search_key != nullptr || is_seek_to_first);
+  }

-  // If this flag appears in the offset, it indicates a key that is smaller
-  // than any other entry for the same column family
-  static const size_t kFlagMin = port::kMaxSizet;
+  // If this flag appears in the key_size, it indicates a
+  // key that is smaller than any other entry for the same column family.
+  static const size_t kFlagMinInCf = port::kMaxSizet;
+
+  bool is_min_in_cf() const {
+    assert(key_size != kFlagMinInCf ||
+           (key_offset == 0 && search_key == nullptr));
+    return key_size == kFlagMinInCf;
+  }

-  size_t offset;           // offset of an entry in write batch's string buffer.
-  uint32_t column_family;  // column family of the entry.
+  // offset of an entry in write batch's string buffer. If this is a dummy
+  // lookup key, in which case search_key != nullptr, offset is set to either
+  // 0 or max, only for comparison purpose. Because when entries have the same
+  // key, the entry with larger offset is larger, offset = 0 will make a seek
+  // key small or equal than all the entries with the seek key, so that Seek()
+  // will find all the entries of the same key. Similarly, offset = MAX will
+  // make the entry just larger than all entries with the search key so
+  // SeekForPrev() will see all the keys with the same key.
+  size_t offset;
+  uint32_t column_family;  // c1olumn family of the entry.
  size_t key_offset;       // offset of the key in write batch's string buffer.
-  size_t key_size;         // size of the key.
+  size_t key_size;         // size of the key. kFlagMinInCf indicates
+                           // that this is a dummy look up entry for
+                           // SeekToFirst() to the beginning of the column
+                           // family. We use the flag here to save a boolean
+                           // in the struct.

  const Slice* search_key;  // if not null, instead of reading keys from
                            // write batch, use it to compare. This is used
--- a/utilities/write_batch_with_index/write_batch_with_index_test.cc
+++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc
@ -621,7 +621,7 @@ TEST_F(WriteBatchWithIndexTest, TestRandomIteraratorWithBase) {
    for (int i = 0; i < 128; i++) {
      // Random walk and make sure iter and result_iter returns the
      // same key and value
-      int type = rnd.Uniform(5);
+      int type = rnd.Uniform(6);
      ASSERT_OK(iter->status());
      switch (type) {
        case 0:
@ -642,7 +642,15 @@ TEST_F(WriteBatchWithIndexTest, TestRandomIteraratorWithBase) {
          result_iter->Seek(key);
          break;
        }
-        case 3:
+        case 3: {
+          // SeekForPrev to random key
+          auto key_idx = rnd.Uniform(static_cast<int>(source_strings.size()));
+          auto key = source_strings[key_idx];
+          iter->SeekForPrev(key);
+          result_iter->SeekForPrev(key);
+          break;
+        }
+        case 4:
          // Next
          if (is_valid) {
            iter->Next();
@ -652,7 +660,7 @@ TEST_F(WriteBatchWithIndexTest, TestRandomIteraratorWithBase) {
          }
          break;
        default:
-          assert(type == 4);
+          assert(type == 5);
          // Prev
          if (is_valid) {
            iter->Prev();