HashLinkList memtable switches a bucket to a skip list to reduce performance outliers

Summary: In this patch, we enhance the HashLinkList memtable to reduce performance outliers when a bucket contains too many entries. In that case we switch the bucket to a skip list to enable binary search. Add a threshold_use_skiplist parameter to determine when a bucket needs to switch to a skip list. The new data structure is documented in comments in the code. Test Plan: make all check; set threshold_use_skiplist in several tests. Reviewers: yhchiang, haobo, ljin Reviewed By: yhchiang, ljin Subscribers: nkg-, xjin, dhruba, yhchiang, leveldb Differential Revision: https://reviews.facebook.net/D19299
11 years ago · 9c332aa11a
parent 6634844dba
commit 9c332aa11a
8 changed files with 386 additions and 86 deletions
--- a/HISTORY.md
+++ b/HISTORY.md
@ -2,6 +2,9 @@

 ## Unreleased

+### New Features
+* HashLinkList reduces performance outliers caused by skewed buckets by switching the data in a bucket from a linked list to a skip list. Add parameter threshold_use_skiplist to NewHashLinkListRepFactory().
+

 ## 3.2.0 (06/20/2014)

--- a/db/db_test.cc
+++ b/db/db_test.cc
@ -488,7 +488,8 @@ class DBTest {
        break;
      case kHashLinkList:
        options.prefix_extractor.reset(NewFixedPrefixTransform(1));
-        options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0));
+        options.memtable_factory.reset(
+            NewHashLinkListRepFactory(4, 0, 3, true, 4));
        break;
      case kHashCuckoo:
        options.memtable_factory.reset(
--- a/db/dbformat.h
+++ b/db/dbformat.h
@ -297,6 +297,13 @@ class IterKey {
                   parsed_key_suffix.sequence, parsed_key_suffix.type);
  }

+  void EncodeLengthPrefixedKey(const Slice& key) {
+    auto size = key.size();
+    EnlargeBufferIfNeeded(size + VarintLength(size));
+    char* ptr = EncodeVarint32(key_, size);
+    memcpy(ptr, key.data(), size);
+  }
+
 private:
  char* key_;
  size_t buf_size_;
--- a/db/plain_table_db_test.cc
+++ b/db/plain_table_db_test.cc
@ -62,7 +62,7 @@ class PlainTableDBTest {
  Options CurrentOptions() {
    Options options;
    options.table_factory.reset(NewPlainTableFactory(0, 2, 0.8, 3, 0, kPrefix));
-    options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
+    options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true, 3));
    options.prefix_extractor.reset(NewFixedPrefixTransform(8));
    options.allow_mmap_reads = true;
    return options;
--- a/db/prefix_test.cc
+++ b/db/prefix_test.cc
@ -189,6 +189,10 @@ class PrefixTest {
          options.memtable_factory.reset(
              NewHashLinkListRepFactory(bucket_count, 2 * 1024 * 1024));
          return true;
+        case kHashLinkListTriggerSkipList:
+          options.memtable_factory.reset(
+              NewHashLinkListRepFactory(bucket_count, 0, 3));
+          return true;
        default:
          return false;
      }
@ -208,6 +212,7 @@ class PrefixTest {
    kHashSkipList,
    kHashLinkList,
    kHashLinkListHugePageTlb,
+    kHashLinkListTriggerSkipList,
    kEnd
  };
  int option_config_;
--- a/include/rocksdb/memtablerep.h
+++ b/include/rocksdb/memtablerep.h
@ -227,9 +227,10 @@ extern MemTableRepFactory* NewHashSkipListRepFactory(
    int32_t skiplist_branching_factor = 4
 );

-// The factory is to create memtables with a hashed linked list:
-// it contains a fixed array of buckets, each pointing to a sorted single
-// linked list (null if the bucket is empty).
+// The factory is to create memtables based on a hash table:
+// it contains a fixed array of buckets, each pointing to a sorted linked list,
+// or to a skip list once the number of entries inside the bucket exceeds
+// threshold_use_skiplist.
 // @bucket_count: number of fixed array buckets
 // @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
 //                      Otherwise from huge page TLB. The user needs to reserve
@ -240,10 +241,13 @@ extern MemTableRepFactory* NewHashSkipListRepFactory(
 //                                    exceeds this number, log about it.
 // @if_log_bucket_dist_when_flash: if true, log distribution of number of
 //                                 entries when flushing.
+// @threshold_use_skiplist: a bucket switches to a skip list if the number
+//                          of entries exceeds this parameter (minimum 3).
 extern MemTableRepFactory* NewHashLinkListRepFactory(
    size_t bucket_count = 50000, size_t huge_page_tlb_size = 0,
    int bucket_entries_logging_threshold = 4096,
-    bool if_log_bucket_dist_when_flash = true);
+    bool if_log_bucket_dist_when_flash = true,
+    uint32_t threshold_use_skiplist = 256);

 // This factory creates a cuckoo-hashing based mem-table representation.
 // Cuckoo-hash is a closed-hash strategy, in which all key/value pairs
--- a/util/hash_linklist_rep.cc
+++ b/util/hash_linklist_rep.cc
@ -7,6 +7,7 @@
 #ifndef ROCKSDB_LITE
 #include "util/hash_linklist_rep.h"

+#include <algorithm>
 #include "rocksdb/memtablerep.h"
 #include "util/arena.h"
 #include "rocksdb/slice.h"
@ -22,6 +23,31 @@ namespace rocksdb {
 namespace {

 typedef const char* Key;
+typedef SkipList<Key, const MemTableRep::KeyComparator&> MemtableSkipList;
+typedef port::AtomicPointer Pointer;
+
+// A data structure used as the header of a linked list of a hash bucket.
+struct BucketHeader {
+  Pointer next;
+  uint32_t num_entries;
+
+  explicit BucketHeader(void* n, uint32_t count)
+      : next(n), num_entries(count) {}
+
+  bool IsSkipListBucket() { return next.NoBarrier_Load() == this; }
+};
+
+// A data structure used as the header of a skip list of a hash bucket.
+struct SkipListBucketHeader {
+  BucketHeader Counting_header;
+  MemtableSkipList skip_list;
+
+  explicit SkipListBucketHeader(const MemTableRep::KeyComparator& cmp,
+                                Arena* arena, uint32_t count)
+      : Counting_header(this,  // Pointing to itself to indicate header type.
+                        count),
+        skip_list(cmp, arena) {}
+};

 struct Node {
  // Accessors/mutators for links.  Wrapped in methods so we can
@ -51,12 +77,75 @@ struct Node {
  char key[0];
 };

+// Memory structure of the mem table:
+// It is a hash table in which each bucket points to a single entry, a linked
+// list or a skip list. To track the total number of records in a bucket and
+// decide whether it should switch to a skip list, a header is added that
+// records the number of entries in the bucket.
+//
+//
+//          +-----> NULL    Case 1. Empty bucket
+//          |
+//          |
+//          | +---> +-------+
+//          | |     | Next  +--> NULL
+//          | |     +-------+
+//  +-----+ | |     |       |  Case 2. One Entry in bucket.
+//  |     +-+ |     | Data  |          next pointer points to
+//  +-----+   |     |       |          NULL. All other cases
+//  |     |   |     |       |          next pointer is not NULL.
+//  +-----+   |     +-------+
+//  |     +---+
+//  +-----+     +-> +-------+  +> +-------+  +-> +-------+
+//  |     |     |   | Next  +--+  | Next  +--+   | Next  +-->NULL
+//  +-----+     |   +-------+     +-------+      +-------+
+//  |     +-----+   | Count |     |       |      |       |
+//  +-----+         +-------+     | Data  |      | Data  |
+//  |     |                       |       |      |       |
+//  +-----+          Case 3.      |       |      |       |
+//  |     |          A header     +-------+      +-------+
+//  +-----+          points to
+//  |     |          a linked list. Count indicates total number
+//  +-----+          of rows in this bucket.
+//  |     |
+//  +-----+    +-> +-------+ <--+
+//  |     |    |   | Next  +----+
+//  +-----+    |   +-------+   Case 4. A header points to a skip
+//  |     +----+   | Count |           list and next pointer points to
+//  +-----+        +-------+           itself, to distinguish case 3 or 4.
+//  |     |        |       |           Count is still kept to indicate number
+//  +-----+        | Skip +-->         of entries in the bucket for debugging
+//  |     |        | List  |   Data    purposes.
+//  |     |        |      +-->
+//  +-----+        |       |
+//  |     |        +-------+
+//  +-----+
+//
+// There is no data race when changing cases because:
+// (1) When changing from case 2->3, we create a new bucket header, put the
+//     single node there first without changing the original node, and do a
+//     release store when changing the bucket pointer. A reader who sees a
+//     stale bucket pointer still reads the single node, while a reader who
+//     sees the new pointer sees a complete header due to the release store.
+// (2) When changing case 3->4, a new header is created with a skip list that
+//     points to the data, before doing a release store to change the bucket
+//     pointer. The old header and nodes are never changed, so any reader who
+//     sees any of those existing pointers is guaranteed to be able to iterate
+//     to the end of the linked list.
+// (3) Header's next pointer in case 3 might change, but it is never equal
+//     to the header itself, so whether a reader sees a stale or a newer
+//     value, it can correctly distinguish case 3 from case 4.
+//
+// The reason we use case 2 is that we want the format to be efficient when
+// the utilization of buckets is relatively low. If we used case 3 for
+// single-entry buckets, we would waste 12 bytes for every entry, which can
+// be a significant decrease of memory utilization.
 class HashLinkListRep : public MemTableRep {
 public:
  HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena,
                  const SliceTransform* transform, size_t bucket_size,
-                  size_t huge_page_tlb_size, Logger* logger,
-                  int bucket_entries_logging_threshold,
+                  uint32_t threshold_use_skiplist, size_t huge_page_tlb_size,
+                  Logger* logger, int bucket_entries_logging_threshold,
                  bool if_log_bucket_dist_when_flash);

  virtual KeyHandle Allocate(const size_t len, char** buf) override;
@ -80,7 +169,6 @@ class HashLinkListRep : public MemTableRep {

 private:
  friend class DynamicIterator;
-  typedef SkipList<const char*, const MemTableRep::KeyComparator&> FullList;

  size_t bucket_size_;

@ -88,6 +176,8 @@ class HashLinkListRep : public MemTableRep {
  // the same transform.
  port::AtomicPointer* buckets_;

+  const uint32_t threshold_use_skiplist_;
+
  // The user-supplied transform whose domain is the user keys.
  const SliceTransform* transform_;

@ -97,7 +187,12 @@ class HashLinkListRep : public MemTableRep {
  int bucket_entries_logging_threshold_;
  bool if_log_bucket_dist_when_flash_;

-  bool BucketContains(Node* head, const Slice& key) const;
+  bool LinkListContains(Node* head, const Slice& key) const;
+
+  SkipListBucketHeader* GetSkipListBucketHeader(Pointer* first_next_pointer)
+      const;
+
+  Node* GetLinkListFirstNode(Pointer* first_next_pointer) const;

  Slice GetPrefix(const Slice& internal_key) const {
    return transform_->Transform(ExtractUserKey(internal_key));
@ -107,11 +202,11 @@ class HashLinkListRep : public MemTableRep {
    return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_;
  }

-  Node* GetBucket(size_t i) const {
-    return static_cast<Node*>(buckets_[i].Acquire_Load());
+  Pointer* GetBucket(size_t i) const {
+    return static_cast<Pointer*>(buckets_[i].Acquire_Load());
  }

-  Node* GetBucket(const Slice& slice) const {
+  Pointer* GetBucket(const Slice& slice) const {
    return GetBucket(GetHash(slice));
  }

@ -119,7 +214,6 @@ class HashLinkListRep : public MemTableRep {
    return (compare_(b, a) == 0);
  }

-
  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }

  bool KeyIsAfterNode(const Slice& internal_key, const Node* n) const {
@ -137,8 +231,8 @@ class HashLinkListRep : public MemTableRep {

  class FullListIterator : public MemTableRep::Iterator {
   public:
-    explicit FullListIterator(FullList* list, Arena* arena)
-      : iter_(list), full_list_(list), arena_(arena) {}
+    explicit FullListIterator(MemtableSkipList* list, Arena* arena)
+        : iter_(list), full_list_(list), arena_(arena) {}

    virtual ~FullListIterator() {
    }
@ -189,22 +283,22 @@ class HashLinkListRep : public MemTableRep {
      iter_.SeekToLast();
    }
   private:
-    FullList::Iterator iter_;
+    MemtableSkipList::Iterator iter_;
    // To destruct with the iterator.
-    std::unique_ptr<FullList> full_list_;
+    std::unique_ptr<MemtableSkipList> full_list_;
    std::unique_ptr<Arena> arena_;
    std::string tmp_;       // For passing to EncodeKey
  };

-  class Iterator : public MemTableRep::Iterator {
+  class LinkListIterator : public MemTableRep::Iterator {
   public:
-    explicit Iterator(const HashLinkListRep* const hash_link_list_rep,
-                      Node* head) :
-        hash_link_list_rep_(hash_link_list_rep), head_(head), node_(nullptr) {
-    }
+    explicit LinkListIterator(const HashLinkListRep* const hash_link_list_rep,
+                              Node* head)
+        : hash_link_list_rep_(hash_link_list_rep),
+          head_(head),
+          node_(nullptr) {}

-    virtual ~Iterator() {
-    }
+    virtual ~LinkListIterator() {}

    // Returns true iff the iterator is positioned at a valid node.
    virtual bool Valid() const {
@ -271,22 +365,68 @@ class HashLinkListRep : public MemTableRep {
    }
  };

-  class DynamicIterator : public HashLinkListRep::Iterator {
+  class DynamicIterator : public HashLinkListRep::LinkListIterator {
   public:
    explicit DynamicIterator(HashLinkListRep& memtable_rep)
-      : HashLinkListRep::Iterator(&memtable_rep, nullptr),
-        memtable_rep_(memtable_rep) {}
+        : HashLinkListRep::LinkListIterator(&memtable_rep, nullptr),
+          memtable_rep_(memtable_rep) {}

    // Advance to the first entry with a key >= target
    virtual void Seek(const Slice& k, const char* memtable_key) {
      auto transformed = memtable_rep_.GetPrefix(k);
-      Reset(memtable_rep_.GetBucket(transformed));
-      HashLinkListRep::Iterator::Seek(k, memtable_key);
+      auto* bucket = memtable_rep_.GetBucket(transformed);
+
+      SkipListBucketHeader* skip_list_header =
+          memtable_rep_.GetSkipListBucketHeader(bucket);
+      if (skip_list_header != nullptr) {
+        // The bucket is organized as a skip list
+        if (!skip_list_iter_) {
+          skip_list_iter_.reset(
+              new MemtableSkipList::Iterator(&skip_list_header->skip_list));
+        } else {
+          skip_list_iter_->SetList(&skip_list_header->skip_list);
+        }
+        if (memtable_key != nullptr) {
+          skip_list_iter_->Seek(memtable_key);
+        } else {
+          IterKey encoded_key;
+          encoded_key.EncodeLengthPrefixedKey(k);
+          skip_list_iter_->Seek(encoded_key.GetKey().data());
+        }
+      } else {
+        // The bucket is organized as a linked list
+        skip_list_iter_.reset();
+        Reset(memtable_rep_.GetLinkListFirstNode(bucket));
+        HashLinkListRep::LinkListIterator::Seek(k, memtable_key);
+      }
+    }
+
+    virtual bool Valid() const {
+      if (skip_list_iter_) {
+        return skip_list_iter_->Valid();
+      }
+      return HashLinkListRep::LinkListIterator::Valid();
+    }
+
+    virtual const char* key() const {
+      if (skip_list_iter_) {
+        return skip_list_iter_->key();
+      }
+      return HashLinkListRep::LinkListIterator::key();
+    }
+
+    virtual void Next() {
+      if (skip_list_iter_) {
+        skip_list_iter_->Next();
+      } else {
+        HashLinkListRep::LinkListIterator::Next();
+      }
    }

   private:
    // the underlying memtable
    const HashLinkListRep& memtable_rep_;
+    std::unique_ptr<MemtableSkipList::Iterator> skip_list_iter_;
  };

  class EmptyIterator : public MemTableRep::Iterator {
@ -312,12 +452,16 @@ class HashLinkListRep : public MemTableRep {

 HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare,
                                 Arena* arena, const SliceTransform* transform,
-                                 size_t bucket_size, size_t huge_page_tlb_size,
-                                 Logger* logger,
+                                 size_t bucket_size,
+                                 uint32_t threshold_use_skiplist,
+                                 size_t huge_page_tlb_size, Logger* logger,
                                 int bucket_entries_logging_threshold,
                                 bool if_log_bucket_dist_when_flash)
    : MemTableRep(arena),
      bucket_size_(bucket_size),
+      // Threshold to use skip list doesn't make sense if less than 3, so we
+      // force it to be minimum of 3 to simplify implementation.
+      threshold_use_skiplist_(std::max(threshold_use_skiplist, 3U)),
      transform_(transform),
      compare_(compare),
      logger_(logger),
@ -343,53 +487,161 @@ KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) {
  return static_cast<void*>(x);
 }

+SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader(
+    Pointer* first_next_pointer) const {
+  if (first_next_pointer == nullptr) {
+    return nullptr;
+  }
+  if (first_next_pointer->NoBarrier_Load() == nullptr) {
+    // Single entry bucket
+    return nullptr;
+  }
+  // Counting header
+  BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
+  if (header->IsSkipListBucket()) {
+    assert(header->num_entries > threshold_use_skiplist_);
+    auto* skip_list_bucket_header =
+        reinterpret_cast<SkipListBucketHeader*>(header);
+    assert(skip_list_bucket_header->Counting_header.next.NoBarrier_Load() ==
+           header);
+    return skip_list_bucket_header;
+  }
+  assert(header->num_entries <= threshold_use_skiplist_);
+  return nullptr;
+}
+
+Node* HashLinkListRep::GetLinkListFirstNode(Pointer* first_next_pointer) const {
+  if (first_next_pointer == nullptr) {
+    return nullptr;
+  }
+  if (first_next_pointer->NoBarrier_Load() == nullptr) {
+    // Single entry bucket
+    return reinterpret_cast<Node*>(first_next_pointer);
+  }
+  // Counting header
+  BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
+  if (!header->IsSkipListBucket()) {
+    assert(header->num_entries <= threshold_use_skiplist_);
+    return reinterpret_cast<Node*>(header->next.NoBarrier_Load());
+  }
+  assert(header->num_entries > threshold_use_skiplist_);
+  return nullptr;
+}
+
 void HashLinkListRep::Insert(KeyHandle handle) {
  Node* x = static_cast<Node*>(handle);
  assert(!Contains(x->key));
  Slice internal_key = GetLengthPrefixedSlice(x->key);
  auto transformed = GetPrefix(internal_key);
  auto& bucket = buckets_[GetHash(transformed)];
-  Node* head = static_cast<Node*>(bucket.Acquire_Load());
+  Pointer* first_next_pointer = static_cast<Pointer*>(bucket.NoBarrier_Load());

-  if (!head) {
+  if (first_next_pointer == nullptr) {
+    // Case 1. empty bucket
    // NoBarrier_SetNext() suffices since we will add a barrier when
    // we publish a pointer to "x" in prev[i].
    x->NoBarrier_SetNext(nullptr);
-    bucket.Release_Store(static_cast<void*>(x));
+    bucket.Release_Store(x);
    return;
  }

-  Node* cur = head;
-  Node* prev = nullptr;
-  while (true) {
-    if (cur == nullptr) {
-      break;
-    }
-    Node* next = cur->Next();
-    // Make sure the lists are sorted.
-    // If x points to head_ or next points nullptr, it is trivially satisfied.
-    assert((cur == head) || (next == nullptr) ||
-           KeyIsAfterNode(next->key, cur));
-    if (KeyIsAfterNode(internal_key, cur)) {
-      // Keep searching in this list
-      prev = cur;
-      cur = next;
-    } else {
-      break;
+  BucketHeader* header = nullptr;
+  if (first_next_pointer->NoBarrier_Load() == nullptr) {
+    // Case 2. only one entry in the bucket
+    // Need to convert to a Counting bucket, then fall through to case 3 or 5.
+    Node* first = reinterpret_cast<Node*>(first_next_pointer);
+    // Need to add a bucket header.
+    // We have to first convert it to a bucket with header before inserting
+    // the new node. Otherwise, we might need to change next pointer of first.
+    // In that case, a reader might see that the next pointer is not NULL and
+    // wrongly think the node is a bucket header.
+    auto* mem = arena_->AllocateAligned(sizeof(BucketHeader));
+    header = new (mem) BucketHeader(first, 1);
+    bucket.Release_Store(header);
+  } else {
+    header = reinterpret_cast<BucketHeader*>(first_next_pointer);
+    if (header->IsSkipListBucket()) {
+      // Case 4. Bucket is already a skip list
+      assert(header->num_entries > threshold_use_skiplist_);
+      auto* skip_list_bucket_header =
+          reinterpret_cast<SkipListBucketHeader*>(header);
+      skip_list_bucket_header->Counting_header.num_entries++;
+      skip_list_bucket_header->skip_list.Insert(x->key);
+      return;
    }
  }

-  // Our data structure does not allow duplicate insertion
-  assert(cur == nullptr || !Equal(x->key, cur->key));
+  if (bucket_entries_logging_threshold_ > 0 &&
+      header->num_entries ==
+          static_cast<uint32_t>(bucket_entries_logging_threshold_)) {
+    Info(logger_,
+         "HashLinkedList bucket %zu has more than %d "
+         "entries. Key to insert: %s",
+         GetHash(transformed), header->num_entries,
+         GetLengthPrefixedSlice(x->key).ToString(true).c_str());
+  }

-  // NoBarrier_SetNext() suffices since we will add a barrier when
-  // we publish a pointer to "x" in prev[i].
-  x->NoBarrier_SetNext(cur);
+  if (header->num_entries == threshold_use_skiplist_) {
+    // Case 3. number of entries reaches the threshold so need to convert to
+    // skip list.
+    LinkListIterator bucket_iter(
+        this, reinterpret_cast<Node*>(first_next_pointer->NoBarrier_Load()));
+    auto mem = arena_->AllocateAligned(sizeof(SkipListBucketHeader));
+    SkipListBucketHeader* new_skip_list_header = new (mem)
+        SkipListBucketHeader(compare_, arena_, header->num_entries + 1);
+    auto& skip_list = new_skip_list_header->skip_list;
+
+    // Add all current entries to the skip list
+    for (bucket_iter.SeekToHead(); bucket_iter.Valid(); bucket_iter.Next()) {
+      skip_list.Insert(bucket_iter.key());
+    }

-  if (prev) {
-    prev->SetNext(x);
+    // insert the new entry
+    skip_list.Insert(x->key);
+    // Set the bucket
+    bucket.Release_Store(new_skip_list_header);
  } else {
-    bucket.Release_Store(static_cast<void*>(x));
+    // Case 5. Need to insert to the sorted linked list without changing the
+    // header.
+    Node* first = reinterpret_cast<Node*>(header->next.NoBarrier_Load());
+    assert(first != nullptr);
+    // Advance the counter only when the bucket stays a linked list. When it
+    // is converted to a skip list instead, the old header's count must never
+    // exceed threshold_use_skiplist_, so readers don't cast to a wrong format.
+    header->num_entries++;
+
+    Node* cur = first;
+    Node* prev = nullptr;
+    while (true) {
+      if (cur == nullptr) {
+        break;
+      }
+      Node* next = cur->Next();
+      // Make sure the lists are sorted.
+      // If x points to head_ or next points nullptr, it is trivially satisfied.
+      assert((cur == first) || (next == nullptr) ||
+             KeyIsAfterNode(next->key, cur));
+      if (KeyIsAfterNode(internal_key, cur)) {
+        // Keep searching in this list
+        prev = cur;
+        cur = next;
+      } else {
+        break;
+      }
+    }
+
+    // Our data structure does not allow duplicate insertion
+    assert(cur == nullptr || !Equal(x->key, cur->key));
+
+    // NoBarrier_SetNext() suffices since we will add a barrier when
+    // we publish a pointer to "x" in prev[i].
+    x->NoBarrier_SetNext(cur);
+
+    if (prev) {
+      prev->SetNext(x);
+    } else {
+      header->next.Release_Store(static_cast<void*>(x));
+    }
  }
 }

@ -401,7 +653,13 @@ bool HashLinkListRep::Contains(const char* key) const {
  if (bucket == nullptr) {
    return false;
  }
-  return BucketContains(bucket, internal_key);
+
+  SkipListBucketHeader* skip_list_header = GetSkipListBucketHeader(bucket);
+  if (skip_list_header != nullptr) {
+    return skip_list_header->skip_list.Contains(key);
+  } else {
+    return LinkListContains(GetLinkListFirstNode(bucket), internal_key);
+  }
 }

 size_t HashLinkListRep::ApproximateMemoryUsage() {
@ -413,37 +671,53 @@ void HashLinkListRep::Get(const LookupKey& k, void* callback_args,
                          bool (*callback_func)(void* arg, const char* entry)) {
  auto transformed = transform_->Transform(k.user_key());
  auto bucket = GetBucket(transformed);
-  if (bucket != nullptr) {
-    Iterator iter(this, bucket);
-    for (iter.Seek(k.internal_key(), nullptr);
+
+  auto* skip_list_header = GetSkipListBucketHeader(bucket);
+  if (skip_list_header != nullptr) {
+    // Is a skip list
+    MemtableSkipList::Iterator iter(&skip_list_header->skip_list);
+    for (iter.Seek(k.memtable_key().data());
         iter.Valid() && callback_func(callback_args, iter.key());
         iter.Next()) {
    }
+  } else {
+    auto* link_list_head = GetLinkListFirstNode(bucket);
+    if (link_list_head != nullptr) {
+      LinkListIterator iter(this, link_list_head);
+      for (iter.Seek(k.internal_key(), nullptr);
+           iter.Valid() && callback_func(callback_args, iter.key());
+           iter.Next()) {
+      }
+    }
  }
 }

 MemTableRep::Iterator* HashLinkListRep::GetIterator(Arena* alloc_arena) {
  // allocate a new arena of similar size to the one currently in use
  Arena* new_arena = new Arena(arena_->BlockSize());
-  auto list = new FullList(compare_, new_arena);
+  auto list = new MemtableSkipList(compare_, new_arena);
  HistogramImpl keys_per_bucket_hist;

  for (size_t i = 0; i < bucket_size_; ++i) {
    int count = 0;
-    bool num_entries_printed = false;
-    auto bucket = GetBucket(i);
+    auto* bucket = GetBucket(i);
    if (bucket != nullptr) {
-      Iterator itr(this, bucket);
-      for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
-        list->Insert(itr.key());
-        if (logger_ != nullptr &&
-            ++count >= bucket_entries_logging_threshold_ &&
-            !num_entries_printed) {
-          num_entries_printed = true;
-          Info(logger_, "HashLinkedList bucket %zu has more than %d "
-               "entries. %dth key: %s",
-               i, count, count,
-               GetLengthPrefixedSlice(itr.key()).ToString(true).c_str());
+      auto* skip_list_header = GetSkipListBucketHeader(bucket);
+      if (skip_list_header != nullptr) {
+        // Is a skip list
+        MemtableSkipList::Iterator itr(&skip_list_header->skip_list);
+        for (itr.SeekToFirst(); itr.Valid(); itr.Next()) {
+          list->Insert(itr.key());
+          count++;
+        }
+      } else {
+        auto* link_list_head = GetLinkListFirstNode(bucket);
+        if (link_list_head != nullptr) {
+          LinkListIterator itr(this, link_list_head);
+          for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
+            list->Insert(itr.key());
+            count++;
+          }
        }
      }
    }
@ -474,7 +748,8 @@ MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator(
  }
 }

-bool HashLinkListRep::BucketContains(Node* head, const Slice& user_key) const {
+bool HashLinkListRep::LinkListContains(Node* head,
+                                       const Slice& user_key) const {
  Node* x = FindGreaterOrEqualInBucket(head, user_key);
  return (x != nullptr && Equal(user_key, x->key));
 }
@ -505,17 +780,19 @@ Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head,
 MemTableRep* HashLinkListRepFactory::CreateMemTableRep(
    const MemTableRep::KeyComparator& compare, Arena* arena,
    const SliceTransform* transform, Logger* logger) {
-  return new HashLinkListRep(
-      compare, arena, transform, bucket_count_, huge_page_tlb_size_, logger,
-      bucket_entries_logging_threshold_, if_log_bucket_dist_when_flash_);
+  return new HashLinkListRep(compare, arena, transform, bucket_count_,
+                             threshold_use_skiplist_, huge_page_tlb_size_,
+                             logger, bucket_entries_logging_threshold_,
+                             if_log_bucket_dist_when_flash_);
 }

 MemTableRepFactory* NewHashLinkListRepFactory(
    size_t bucket_count, size_t huge_page_tlb_size,
-    int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash) {
-  return new HashLinkListRepFactory(bucket_count, huge_page_tlb_size,
-                                    bucket_entries_logging_threshold,
-                                    if_log_bucket_dist_when_flash);
+    int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash,
+    uint32_t threshold_use_skiplist) {
+  return new HashLinkListRepFactory(
+      bucket_count, threshold_use_skiplist, huge_page_tlb_size,
+      bucket_entries_logging_threshold, if_log_bucket_dist_when_flash);
 }

 } // namespace rocksdb
--- a/util/hash_linklist_rep.h
+++ b/util/hash_linklist_rep.h
@ -16,10 +16,12 @@ namespace rocksdb {
 class HashLinkListRepFactory : public MemTableRepFactory {
 public:
  explicit HashLinkListRepFactory(size_t bucket_count,
+                                  uint32_t threshold_use_skiplist,
                                  size_t huge_page_tlb_size,
                                  int bucket_entries_logging_threshold,
                                  bool if_log_bucket_dist_when_flash)
      : bucket_count_(bucket_count),
+        threshold_use_skiplist_(threshold_use_skiplist),
        huge_page_tlb_size_(huge_page_tlb_size),
        bucket_entries_logging_threshold_(bucket_entries_logging_threshold),
        if_log_bucket_dist_when_flash_(if_log_bucket_dist_when_flash) {}
@ -36,6 +38,7 @@ class HashLinkListRepFactory : public MemTableRepFactory {

 private:
  const size_t bucket_count_;
+  const uint32_t threshold_use_skiplist_;
  const size_t huge_page_tlb_size_;
  int bucket_entries_logging_threshold_;
  bool if_log_bucket_dist_when_flash_;