From 9a9d4759b24695051a41f4f8da0442940e977393 Mon Sep 17 00:00:00 2001 From: Nathan Bronson Date: Thu, 19 Nov 2015 14:24:29 -0800 Subject: [PATCH] InlineSkipList part 3/3 - new skiplist type that colocates key and node Summary: This diff completes the creation of InlineSkipList, which is like SkipList but it always allocates the key contiguously with the node. This allows us to remove the pointer from the node to the key. As a result the memory usage of the skip list is reduced (by 1 to sizeof(void*) bytes depending on the padding required to align the key storage), cache locality is improved, and we halve the number of calls to the allocator. For skip lists whose keys are freshly-allocated const char*, InlineSkipList is strictly preferable to SkipList. This diff doesn't replace SkipList, however, because some of the use cases of SkipList in RocksDB are either character sequences that are not allocated at the same time as the skip list node allocation (for example hash_linklist_rep) or have different key types (for example write_batch_with_index). Taking advantage of inline allocation for those cases is left to future work. The perf win is biggest for small values. For single-threaded CPU-bound (32M fillrandom operations with no WAL log) with 16 byte keys and 0 byte values, the db_bench perf goes from ~310k ops/sec to ~410k ops/sec. For large values the improvement is less pronounced, but seems to be between 5% and 10% on the same configuration. 
Test Plan: make check Reviewers: igor, sdong Reviewed By: sdong Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D51123 --- db/db_test.cc | 4 +- db/inlineskiplist.h | 146 +++++++++++++++++++++++++++++++------------- util/skiplistrep.cc | 22 ++++--- 3 files changed, 118 insertions(+), 54 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 57b82719a..1ce4b6e30 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2381,9 +2381,9 @@ TEST_F(DBTest, NumImmutableMemTable) { ASSERT_EQ(num, "0"); ASSERT_TRUE(dbfull()->GetProperty( handles_[1], "rocksdb.cur-size-active-mem-table", &num)); - // "200" is the size of the metadata of an empty skiplist, this would + // "192" is the size of the metadata of an empty skiplist, this would // break if we change the default skiplist implementation - ASSERT_EQ(num, "200"); + ASSERT_EQ(num, "192"); uint64_t int_num; uint64_t base_total_size; diff --git a/db/inlineskiplist.h b/db/inlineskiplist.h index e326640b3..a4fb5ab89 100644 --- a/db/inlineskiplist.h +++ b/db/inlineskiplist.h @@ -1,11 +1,22 @@ // Copyright (c) 2013, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. +// LICENSE file in the root directory of this source tree. An additional +// grant of patent rights can be found in the PATENTS file in the same +// directory. // -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. Use of +// this source code is governed by a BSD-style license that can be found +// in the LICENSE file. See the AUTHORS file for names of contributors. 
+// +// InlineSkipList is derived from SkipList (skiplist.h), but it optimizes +// the memory layout by requiring that the key storage be allocated through +// the skip list instance. For the common case of SkipList this saves 1 pointer per skip list node and gives better cache +// locality, at the expense of wasted padding from using AllocateAligned +// instead of Allocate for the keys. The unused padding will be from +// 0 to sizeof(void*)-1 bytes, and the space savings are sizeof(void*) +// bytes, so despite the padding the space used is always less than +// SkipList. // // Thread safety ------------- // @@ -30,8 +41,8 @@ #pragma once #include -#include #include +#include #include "port/port.h" #include "util/allocator.h" #include "util/random.h" @@ -52,10 +63,13 @@ class InlineSkipList { int32_t max_height = 12, int32_t branching_factor = 4); - // Allocates a key that can be passed to Insert. + // Allocates a key and a skip-list node, returning a pointer to the + // key portion of the node. char* AllocateKey(size_t key_size); - // Insert key into the list. + // Inserts a key allocated by AllocateKey, after the actual key value + // has been filled in. + // // REQUIRES: nothing that compares equal to key is currently in the list. void Insert(const char* key); @@ -135,14 +149,16 @@ class InlineSkipList { return max_height_.load(std::memory_order_relaxed); } - Node* NewNode(const char* key, int height); int RandomHeight(); + Node* AllocateNode(size_t key_size, int height); + bool Equal(const char* a, const char* b) const { return (compare_(a, b) == 0); } - // Return true if key is greater than the data stored in "n" + // Return true if key is greater than the data stored in "n". Null n + // is considered infinite. bool KeyIsAfterNode(const char* key, Node* n) const; // Returns the earliest node with a key >= key. 
@@ -161,54 +177,69 @@ class InlineSkipList { // No copying allowed InlineSkipList(const InlineSkipList&); - void operator=(const InlineSkipList&); + InlineSkipList& operator=(const InlineSkipList&); }; // Implementation details follow + +// The Node data type is more of a pointer into custom-managed memory than +// a traditional C++ struct. The key is stored in the bytes immediately +// after the struct, and the next_ pointers for nodes with height > 1 are +// stored immediately _before_ the struct. This avoids the need to include +// any pointer or sizing data, which reduces per-node memory overheads. template struct InlineSkipList::Node { - explicit Node(const char* k) : key(k) {} + // Stores the height of the node in the memory location normally used for + // next_[0]. This is used for passing data from AllocateKey to Insert. + void StashHeight(const int height) { + assert(sizeof(int) <= sizeof(next_[0])); + memcpy(&next_[0], &height, sizeof(int)); + } + + // Retrieves the value passed to StashHeight. Undefined after a call + // to SetNext or NoBarrier_SetNext. + int UnstashHeight() const { + int rv; + memcpy(&rv, &next_[0], sizeof(int)); + return rv; + } - const char* const key; + const char* Key() const { return reinterpret_cast(&next_[1]); } - // Accessors/mutators for links. Wrapped in methods so we can - // add the appropriate barriers as necessary. + // Accessors/mutators for links. Wrapped in methods so we can add + // the appropriate barriers as necessary, and perform the necessary + // addressing trickery for storing links below the Node in memory. Node* Next(int n) { assert(n >= 0); // Use an 'acquire load' so that we observe a fully initialized // version of the returned Node. 
- return (next_[n].load(std::memory_order_acquire)); + return (next_[-n].load(std::memory_order_acquire)); } + void SetNext(int n, Node* x) { assert(n >= 0); // Use a 'release store' so that anybody who reads through this // pointer observes a fully initialized version of the inserted node. - next_[n].store(x, std::memory_order_release); + next_[-n].store(x, std::memory_order_release); } // No-barrier variants that can be safely used in a few locations. Node* NoBarrier_Next(int n) { assert(n >= 0); - return next_[n].load(std::memory_order_relaxed); + return next_[-n].load(std::memory_order_relaxed); } + void NoBarrier_SetNext(int n, Node* x) { assert(n >= 0); - next_[n].store(x, std::memory_order_relaxed); + next_[-n].store(x, std::memory_order_relaxed); } private: - // Array of length equal to the node height. next_[0] is lowest level link. + // next_[0] is the lowest level link (level 0). Higher levels are + // stored _earlier_, so level 1 is at next_[-1]. std::atomic next_[1]; }; -template -typename InlineSkipList::Node* InlineSkipList::NewNode( - const char* key, int height) { - char* mem = allocator_->AllocateAligned( - sizeof(Node) + sizeof(std::atomic) * (height - 1)); - return new (mem) Node(key); -} - template inline InlineSkipList::Iterator::Iterator( const InlineSkipList* list) { @@ -230,7 +261,7 @@ inline bool InlineSkipList::Iterator::Valid() const { template inline const char* InlineSkipList::Iterator::key() const { assert(Valid()); - return node_->key; + return node_->Key(); } template @@ -244,7 +275,7 @@ inline void InlineSkipList::Iterator::Prev() { // Instead of using explicit "prev" links, we just search for the // last node that falls before key. 
assert(Valid()); - node_ = list_->FindLessThan(node_->key); + node_ = list_->FindLessThan(node_->Key()); if (node_ == list_->head_) { node_ = nullptr; } @@ -286,7 +317,7 @@ template bool InlineSkipList::KeyIsAfterNode(const char* key, Node* n) const { // nullptr n is considered infinite - return (n != nullptr) && (compare_(n->key, key) < 0); + return (n != nullptr) && (compare_(n->Key(), key) < 0); } template @@ -303,11 +334,12 @@ InlineSkipList::FindGreaterOrEqual(const char* key) const { while (true) { Node* next = x->Next(level); // Make sure the lists are sorted - assert(x == head_ || next == nullptr || KeyIsAfterNode(next->key, x)); + assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x)); // Make sure we haven't overshot during our search assert(x == head_ || KeyIsAfterNode(key, x)); - int cmp = - (next == nullptr || next == last_bigger) ? 1 : compare_(next->key, key); + int cmp = (next == nullptr || next == last_bigger) + ? 1 + : compare_(next->Key(), key); if (cmp == 0 || (cmp > 0 && level == 0)) { return next; } else if (cmp < 0) { @@ -330,7 +362,7 @@ InlineSkipList::FindLessThan(const char* key, Node** prev) const { Node* last_not_after = nullptr; while (true) { Node* next = x->Next(level); - assert(x == head_ || next == nullptr || KeyIsAfterNode(next->key, x)); + assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x)); assert(x == head_ || KeyIsAfterNode(key, x)); if (next != last_not_after && KeyIsAfterNode(key, next)) { // Keep searching in this list @@ -377,9 +409,9 @@ uint64_t InlineSkipList::EstimateCount(const char* key) const { Node* x = head_; int level = GetMaxHeight() - 1; while (true) { - assert(x == head_ || compare_(x->key, key) < 0); + assert(x == head_ || compare_(x->Key(), key) < 0); Node* next = x->Next(level); - if (next == nullptr || compare_(next->key, key) >= 0) { + if (next == nullptr || compare_(next->Key(), key) >= 0) { if (level == 0) { return count; } else { @@ -404,7 +436,7 @@ 
InlineSkipList::InlineSkipList(const Comparator cmp, kScaledInverseBranching_((Random::kMaxNext + 1) / kBranching_), compare_(cmp), allocator_(allocator), - head_(NewNode(0 /* any key will do */, max_height)), + head_(AllocateNode(0, max_height)), max_height_(1), prev_height_(1) { assert(max_height > 0 && kMaxHeight_ == static_cast(max_height)); @@ -424,7 +456,31 @@ InlineSkipList::InlineSkipList(const Comparator cmp, template char* InlineSkipList::AllocateKey(size_t key_size) { - return allocator_->Allocate(key_size); + return const_cast(AllocateNode(key_size, RandomHeight())->Key()); +} + +template +typename InlineSkipList::Node* +InlineSkipList::AllocateNode(size_t key_size, int height) { + auto prefix = sizeof(std::atomic) * (height - 1); + + // prefix is space for the height - 1 pointers that we store before + // the Node instance (next_[-(height - 1) .. -1]). Node starts at + // raw + prefix, and holds the bottom-most (level 0) skip list pointer + // next_[0]. key_size is the bytes for the key, which comes just after + // the Node. + char* raw = allocator_->AllocateAligned(prefix + sizeof(Node) + key_size); + Node* x = reinterpret_cast(raw + prefix); + + // Once we've linked the node into the skip list we don't actually need + // to know its height, because we can implicitly use the fact that we + // traversed into a node at level h to know that h is a valid level + // for that node. We need to convey the height to the Insert step, + // however, so that it can perform the proper links. Since we're not + // using the pointers at the moment, StashHeight temporarily borrows + // storage from next_[0] for that purpose. 
+ x->StashHeight(height); + return x; } template @@ -449,14 +505,17 @@ void InlineSkipList::Insert(const char* key) { } // Our data structure does not allow duplicate insertion - assert(prev_[0]->Next(0) == nullptr || !Equal(key, prev_[0]->Next(0)->key)); + assert(prev_[0]->Next(0) == nullptr || !Equal(key, prev_[0]->Next(0)->Key())); + + // Find the Node that we placed before the key in AllocateKey + Node* x = reinterpret_cast(const_cast(key)) - 1; + int height = x->UnstashHeight(); + assert(height >= 1 && height <= kMaxHeight_); - int height = RandomHeight(); if (height > GetMaxHeight()) { for (int i = GetMaxHeight(); i < height; i++) { prev_[i] = head_; } - // fprintf(stderr, "Change height from %d to %d\n", max_height_, height); // It is ok to mutate max_height_ without any synchronization // with concurrent readers. A concurrent reader that observes @@ -468,7 +527,6 @@ void InlineSkipList::Insert(const char* key) { max_height_.store(height, std::memory_order_relaxed); } - Node* x = NewNode(key, height); for (int i = 0; i < height; i++) { // NoBarrier_SetNext() suffices since we will add a barrier when // we publish a pointer to "x" in prev[i]. @@ -482,7 +540,7 @@ void InlineSkipList::Insert(const char* key) { template bool InlineSkipList::Contains(const char* key) const { Node* x = FindGreaterOrEqual(key); - if (x != nullptr && Equal(key, x->key)) { + if (x != nullptr && Equal(key, x->Key())) { return true; } else { return false; diff --git a/util/skiplistrep.cc b/util/skiplistrep.cc index 112a7ab12..83472c699 100644 --- a/util/skiplistrep.cc +++ b/util/skiplistrep.cc @@ -3,15 +3,15 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
// -#include "rocksdb/memtablerep.h" +#include "db/inlineskiplist.h" #include "db/memtable.h" -#include "db/skiplist.h" +#include "rocksdb/memtablerep.h" #include "util/arena.h" namespace rocksdb { namespace { class SkipListRep : public MemTableRep { - SkipList skip_list_; + InlineSkipList skip_list_; const MemTableRep::KeyComparator& cmp_; const SliceTransform* transform_; const size_t lookahead_; @@ -25,6 +25,11 @@ public: transform_(transform), lookahead_(lookahead) { } + virtual KeyHandle Allocate(const size_t len, char** buf) override { + *buf = skip_list_.AllocateKey(len); + return static_cast(*buf); + } + // Insert key into the list. // REQUIRES: nothing that compares equal to key is currently in the list. virtual void Insert(KeyHandle handle) override { @@ -65,13 +70,14 @@ public: // Iteration over the contents of a skip list class Iterator : public MemTableRep::Iterator { - SkipList::Iterator iter_; + InlineSkipList::Iterator iter_; + public: // Initialize an iterator over the specified list. // The returned iterator is not valid. explicit Iterator( - const SkipList* list - ) : iter_(list) { } + const InlineSkipList* list) + : iter_(list) {} virtual ~Iterator() override { } @@ -213,8 +219,8 @@ public: private: const SkipListRep& rep_; - SkipList::Iterator iter_; - SkipList::Iterator prev_; + InlineSkipList::Iterator iter_; + InlineSkipList::Iterator prev_; }; virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override {