diff --git a/db/column_family.cc b/db/column_family.cc index 80417b71b..b4fe6c181 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -275,11 +275,6 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, result.max_compaction_bytes = result.target_file_size_base * 25; } - // Insert into memtable with hint is incompatible with concurrent inserts. - if (db_options.allow_concurrent_memtable_write) { - result.memtable_insert_with_hint_prefix_extractor = nullptr; - } - return result; } diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index ff6b8c320..4604e4991 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -559,9 +559,9 @@ TEST_F(DBPropertiesTest, NumImmutableMemTable) { ASSERT_EQ(num, "3"); ASSERT_TRUE(dbfull()->GetIntProperty( handles_[1], "rocksdb.cur-size-active-mem-table", &value)); - // "384" is the size of the metadata of two empty skiplists, this would + // "192" is the size of the metadata of two empty skiplists, this would // break if we change the default skiplist implementation - ASSERT_GE(value, 384); + ASSERT_GE(value, 192); uint64_t int_num; uint64_t base_total_size; diff --git a/db/inlineskiplist.h b/db/inlineskiplist.h index c31aa4aa5..43bb09ac8 100644 --- a/db/inlineskiplist.h +++ b/db/inlineskiplist.h @@ -54,13 +54,13 @@ namespace rocksdb { template class InlineSkipList { - public: - struct InsertHint; - private: struct Node; + struct Splice; public: + static const uint16_t kMaxPossibleHeight = 32; + // Create a new InlineSkipList object that will use "cmp" for comparing // keys, and will allocate memory using "*allocator". Objects allocated // in the allocator must remain allocated for the lifetime of the @@ -74,29 +74,49 @@ class InlineSkipList { // is thread-safe. char* AllocateKey(size_t key_size); + // Allocate a splice using allocator. + Splice* AllocateSplice(); + // Inserts a key allocated by AllocateKey, after the actual key value // has been filled in. 
// // REQUIRES: nothing that compares equal to key is currently in the list. - // REQUIRES: no concurrent calls to INSERT + // REQUIRES: no concurrent calls to any of inserts. void Insert(const char* key); - // Inserts a key allocated by AllocateKey with a hint. It can be used to - // optimize sequential inserts, or inserting a key close to the largest - // key inserted previously with the same hint. + // Inserts a key allocated by AllocateKey with a hint of last insert + // position in the skip-list. If hint points to nullptr, a new hint will be + // populated, which can be used in subsequent calls. // - // If hint points to nullptr, a new hint will be populated, which can be - // used in subsequent calls. + // It can be used to optimize the workload where there are multiple groups + // of keys, and each key is likely to insert to a location close to the last + // inserted key in the same group. One example is sequential inserts. // - // REQUIRES: All keys inserted with the same hint must be consecutive in the - // skip-list, i.e. let [k1..k2] be the range of keys inserted with hint h, - // there shouldn't be a key k in the skip-list with k1 < k < k2, unless k is - // also inserted with the same hint. - void InsertWithHint(const char* key, InsertHint** hint); + // REQUIRES: nothing that compares equal to key is currently in the list. + // REQUIRES: no concurrent calls to any of inserts. + void InsertWithHint(const char* key, void** hint); // Like Insert, but external synchronization is not required. void InsertConcurrently(const char* key); + // Inserts a node into the skip list. key must have been allocated by + // AllocateKey and then filled in by the caller. If UseCAS is true, + // then external synchronization is not required, otherwise this method + // may not be called concurrently with any other insertions. + // + // Regardless of whether UseCAS is true, the splice must be owned + // exclusively by the current thread. 
If allow_partial_splice_fix is + // true, then the cost of insertion is amortized O(log D), where D is + // the distance from the splice to the inserted key (measured as the + // number of intervening nodes). Note that this bound is very good for + // sequential insertions! If allow_partial_splice_fix is false then + // the existing splice will be ignored unless the current key is being + // inserted immediately after the splice. allow_partial_splice_fix == + // false has worse running time for the non-sequential case O(log N), + // but a better constant factor. + template + void Insert(const char* key, Splice* splice, bool allow_partial_splice_fix); + // Returns true iff an entry that compares equal to key is in the list. bool Contains(const char* key) const; @@ -154,8 +174,6 @@ class InlineSkipList { }; private: - static const uint16_t kMaxPossibleHeight = 32; - const uint16_t kMaxHeight_; const uint16_t kBranching_; const uint32_t kScaledInverseBranching_; @@ -170,13 +188,10 @@ class InlineSkipList { // values are ok. std::atomic max_height_; // Height of the entire list - // Used for optimizing sequential insert patterns. Tricky. prev_height_ - // of zero means prev_ is undefined. Otherwise: prev_[i] for i up - // to max_height_ - 1 (inclusive) is the predecessor of prev_[0], and - // prev_height_ is the height of prev_[0]. prev_[0] can only be equal - // to head when max_height_ and prev_height_ are both 1. - Node** prev_; - std::atomic prev_height_; + // seq_splice_ is a Splice used for insertions in the non-concurrent + // case. It caches the prev and next found during the most recent + // non-concurrent insertion. + Splice* seq_splice_; inline int GetMaxHeight() const { return max_height_.load(std::memory_order_relaxed); @@ -186,13 +201,6 @@ class InlineSkipList { Node* AllocateNode(size_t key_size, int height); - // Allocate a hint used by InsertWithHint(). 
- InsertHint* AllocateInsertHint(); - - // Extract the node from a key allocated by AllocateKey(), and populate - // height of the node. - Node* GetNodeForInsert(const char* key, int* height); - bool Equal(const char* a, const char* b) const { return (compare_(a, b) == 0); } @@ -202,7 +210,7 @@ class InlineSkipList { } // Return true if key is greater than the data stored in "n". Null n - // is considered infinite. + // is considered infinite. n should not be head_. bool KeyIsAfterNode(const char* key, Node* n) const; // Returns the earliest node with a key >= key. @@ -232,12 +240,13 @@ class InlineSkipList { // point to a node that is before the key, and after should point to // a node that is after the key. after should be nullptr if a good after // node isn't conveniently available. - void FindLevelSplice(const char* key, Node* before, Node* after, int level, - Node** out_prev, Node** out_next); + void FindSpliceForLevel(const char* key, Node* before, Node* after, int level, + Node** out_prev, Node** out_next); - // Check if we need to invalidate prev_ cache after inserting a node of - // given height. - void MaybeInvalidatePrev(int height); + // Recomputes Splice levels from highest_level (inclusive) down to + // lowest_level (inclusive). + void RecomputeSpliceLevels(const char* key, Splice* splice, + int recompute_level); // No copying allowed InlineSkipList(const InlineSkipList&); @@ -246,6 +255,19 @@ class InlineSkipList { // Implementation details follow +template +struct InlineSkipList::Splice { + // The invariant of a Splice is that prev_[i+1].key <= prev_[i].key < + // next_[i].key <= next_[i+1].key for all i. That means that if a + // key is bracketed by prev_[i] and next_[i] then it is bracketed by + // all higher levels. It is _not_ required that prev_[i]->Next(i) == + // next_[i] (it probably did at some point in the past, but intervening + // or concurrent operations might have inserted nodes in between). 
+ int height_ = 0; + Node** prev_; + Node** next_; +}; + // The Node data type is more of a pointer into custom-managed memory than // a traditional C++ struct. The key is stored in the bytes immediately // after the struct, and the next_ pointers for nodes with height > 1 are @@ -317,17 +339,6 @@ struct InlineSkipList::Node { std::atomic next_[1]; }; -// -// -// Hint to insert position to speed-up inserts. See implementation of -// InsertWithHint() for more details. -template -struct InlineSkipList::InsertHint { - Node** prev; - uint8_t* prev_height; - int num_levels; -}; - template inline InlineSkipList::Iterator::Iterator( const InlineSkipList* list) { @@ -419,6 +430,7 @@ template bool InlineSkipList::KeyIsAfterNode(const char* key, Node* n) const { // nullptr n is considered infinite + assert(n != head_); return (n != nullptr) && (compare_(n->Key(), key) < 0); } @@ -549,19 +561,14 @@ InlineSkipList::InlineSkipList(const Comparator cmp, allocator_(allocator), head_(AllocateNode(0, max_height)), max_height_(1), - prev_height_(1) { + seq_splice_(AllocateSplice()) { assert(max_height > 0 && kMaxHeight_ == static_cast(max_height)); assert(branching_factor > 1 && kBranching_ == static_cast(branching_factor)); assert(kScaledInverseBranching_ > 0); - // Allocate the prev_ Node* array, directly from the passed-in allocator. - // prev_ does not need to be freed, as its life cycle is tied up with - // the allocator as a whole. 
- prev_ = reinterpret_cast( - allocator_->AllocateAligned(sizeof(Node*) * kMaxHeight_)); - for (int i = 0; i < kMaxHeight_; i++) { + + for (int i = 0; i < kMaxHeight_; ++i) { head_->SetNext(i, nullptr); - prev_[i] = head_; } } @@ -595,226 +602,49 @@ InlineSkipList::AllocateNode(size_t key_size, int height) { } template -typename InlineSkipList::InsertHint* -InlineSkipList::AllocateInsertHint() { - InsertHint* hint = reinterpret_cast( - allocator_->AllocateAligned(sizeof(InsertHint))); - // Allocate an extra level on kMaxHeight_, to make boundary cases easier to - // handle. - hint->prev = reinterpret_cast( - allocator_->AllocateAligned(sizeof(Node*) * (kMaxHeight_ + 1))); - hint->prev_height = reinterpret_cast( - allocator_->AllocateAligned(sizeof(uint8_t*) * kMaxHeight_)); - for (int i = 0; i <= kMaxHeight_; i++) { - hint->prev[i] = head_; - } - hint->num_levels = 0; - return hint; +typename InlineSkipList::Splice* +InlineSkipList::AllocateSplice() { + // size of prev_ and next_ + size_t array_size = sizeof(Node*) * (kMaxHeight_ + 1); + char* raw = allocator_->AllocateAligned(sizeof(Splice) + array_size * 2); + Splice* splice = reinterpret_cast(raw); + splice->height_ = 0; + splice->prev_ = reinterpret_cast(raw + sizeof(Splice)); + splice->next_ = reinterpret_cast(raw + sizeof(Splice) + array_size); + return splice; } template -typename InlineSkipList::Node* -InlineSkipList::GetNodeForInsert(const char* key, int* height) { - // Find the Node that we placed before the key in AllocateKey - Node* x = reinterpret_cast(const_cast(key)) - 1; - assert(height != nullptr); - *height = x->UnstashHeight(); - assert(*height >= 1 && *height <= kMaxHeight_); - - if (*height > GetMaxHeight()) { - // It is ok to mutate max_height_ without any synchronization - // with concurrent readers. A concurrent reader that observes - // the new value of max_height_ will see either the old value of - // new level pointers from head_ (nullptr), or a new value set in - // the loop below. 
In the former case the reader will - // immediately drop to the next level since nullptr sorts after all - // keys. In the latter case the reader will use the new node. - max_height_.store(*height, std::memory_order_relaxed); - } - - return x; +void InlineSkipList::Insert(const char* key) { + Insert(key, seq_splice_, false); } template -void InlineSkipList::MaybeInvalidatePrev(int height) { - // We don't have a lock-free algorithm for updating prev_, but we do have - // the option of invalidating the entire sequential-insertion cache. - // prev_'s invariant is that prev_[i] (i > 0) is the predecessor of - // prev_[0] at that level. We're only going to violate that if height - // > 1 and key lands after prev_[height - 1] but before prev_[0]. - // Comparisons are pretty expensive, so an easier version is to just - // clear the cache if height > 1. We only write to prev_height_ if the - // nobody else has, to avoid invalidating the root of the skip list in - // all of the other CPU caches. - if (height > 1 && prev_height_.load(std::memory_order_relaxed) != 0) { - prev_height_.store(0, std::memory_order_relaxed); - } +void InlineSkipList::InsertConcurrently(const char* key) { + Node* prev[kMaxPossibleHeight]; + Node* next[kMaxPossibleHeight]; + Splice splice; + splice.prev_ = prev; + splice.next_ = next; + Insert(key, &splice, false); } template -void InlineSkipList::Insert(const char* key) { - // InsertConcurrently often can't maintain the prev_ invariants, so - // it just sets prev_height_ to zero, letting us know that we should - // ignore it. A relaxed load suffices here because write thread - // synchronization separates Insert calls from InsertConcurrently calls. 
- auto prev_height = prev_height_.load(std::memory_order_relaxed); - - // fast path for sequential insertion - if (prev_height > 0 && !KeyIsAfterNode(key, prev_[0]->NoBarrier_Next(0)) && - (prev_[0] == head_ || KeyIsAfterNode(key, prev_[0]))) { - assert(prev_[0] != head_ || (prev_height == 1 && GetMaxHeight() == 1)); - - // Outside of this method prev_[1..max_height_] is the predecessor - // of prev_[0], and prev_height_ refers to prev_[0]. Inside Insert - // prev_[0..max_height - 1] is the predecessor of key. Switch from - // the external state to the internal - for (int i = 1; i < prev_height; i++) { - prev_[i] = prev_[0]; - } - } else { - // TODO(opt): we could use a NoBarrier predecessor search as an - // optimization for architectures where memory_order_acquire needs - // a synchronization instruction. Doesn't matter on x86 - FindLessThan(key, prev_); - } - - // Our data structure does not allow duplicate insertion - assert(prev_[0]->Next(0) == nullptr || !Equal(key, prev_[0]->Next(0)->Key())); - - int height = 0; - Node* x = GetNodeForInsert(key, &height); - - for (int i = 0; i < height; i++) { - x->InsertAfter(prev_[i], i); - } - prev_[0] = x; - prev_height_.store(height, std::memory_order_relaxed); -} - -// The goal here is to reduce the number of key comparisons, as it can be -// expensive. We maintain a hint which help us to find a insert position -// between or next to previously inserted keys with the same hint. -// Note that we require all keys inserted with the same hint are consecutive -// in the skip-list. -// -// The hint keeps a list of nodes previous inserted with the same hint: -// * The first level, prev[0], points to the largest key of them. -// * For 0 < i < num_levels, prev[i] is the previous node of prev[i-1] -// on level i, i.e. -// prev[i] < prev[i-1] <= prev[i]->Next(i) -// (prev[i-1] and prev[i]->Next(i) could be the same node.) -// In addition prev_height keeps the height of prev[i]. 
-// -// When inserting a new key, we look for the lowest level L where -// prev[L] < key < prev[L-1]. Let -// M = max(prev_height[i]..prev_height[num_levels-1]) -// For each level between in [L, M), the previous node of -// the new key must be one of prev[i]. For levels below L and above M -// we do normal skip-list search if needed. -// -// The optimization is suitable for stream of keys where new inserts are next -// to or close to the largest key ever inserted, e.g. sequential inserts. -template -void InlineSkipList::InsertWithHint(const char* key, - InsertHint** hint_ptr) { - int height = 0; - Node* x = GetNodeForInsert(key, &height); - - // InsertWithHint() is not compatible with prev_ optimization used by - // Insert(). - MaybeInvalidatePrev(height); - - assert(hint_ptr != nullptr); - InsertHint* hint = *hint_ptr; - if (hint == nullptr) { - // AllocateInsertHint will initialize hint with num_levels = 0 and - // prev[i] = head_ for all i. - hint = AllocateInsertHint(); - *hint_ptr = hint; - } - - // Look for the first level i < num_levels with prev[i] < key. - int level = 0; - for (; level < hint->num_levels; level++) { - if (KeyIsAfterNode(key, hint->prev[level])) { - assert(!KeyIsAfterNode(key, hint->prev[level]->Next(level))); - break; - } - } - Node* tmp_prev[kMaxPossibleHeight]; - if (level >= hint->num_levels) { - // The hint is not useful in this case. Fallback to full search. - FindLessThan(key, tmp_prev); - for (int i = 0; i < height; i++) { - assert(tmp_prev[i] == head_ || KeyIsAfterNode(key, tmp_prev[i])); - assert(!KeyIsAfterNode(key, tmp_prev[i]->Next(i))); - x->InsertAfter(tmp_prev[i], i); - } - } else { - // Search on levels below "level", using prev[level] as root. 
- if (level > 0) { - FindLessThan(key, tmp_prev, hint->prev[level], level, 0); - for (int i = 0; i < level && i < height; i++) { - assert(tmp_prev[i] == head_ || KeyIsAfterNode(key, tmp_prev[i])); - assert(!KeyIsAfterNode(key, tmp_prev[i]->Next(i))); - x->InsertAfter(tmp_prev[i], i); - } - } - // The current level where the new node is to insert into skip-list. - int current_level = level; - for (int i = level; i < hint->num_levels; i++) { - while (current_level < height && current_level < hint->prev_height[i]) { - // In this case, prev[i] is the previous node of key on current_level, - // since: - // * prev[i] < key; - // * no other nodes less than prev[level-1] has height greater than - // current_level, and prev[level-1] > key. - assert(KeyIsAfterNode(key, hint->prev[i])); - assert(!KeyIsAfterNode(key, hint->prev[i]->Next(current_level))); - x->InsertAfter(hint->prev[i], current_level); - current_level++; - } - } - // Full search on levels above current_level if needed. - if (current_level < height) { - FindLessThan(key, tmp_prev, head_, GetMaxHeight(), current_level); - for (int i = current_level; i < height; i++) { - assert(tmp_prev[i] == head_ || KeyIsAfterNode(key, tmp_prev[i])); - assert(!KeyIsAfterNode(key, tmp_prev[i]->Next(i))); - x->InsertAfter(tmp_prev[i], i); - } - } - } - // The last step is update the new node into the hint. - // * If "height" <= "level", prev[level] is still the previous node of - // prev[level-1] on level "level". Stop. - // * Otherwise, the new node becomes the new previous node of - // prev[level-1], or if level=0, the new node becomes the largest node - // inserted with the same hint. Replace prev[level] with the new node. - // * If prev[i] is replaced by another node, check if it can replace - // prev[i+1] using a similar rule, up till "num_levels" level. 
- Node* p = x; - uint8_t h = static_cast(height); - for (int i = level; i < hint->num_levels; i++) { - if (h <= i) { - p = nullptr; - break; - } - std::swap(p, hint->prev[i]); - std::swap(h, hint->prev_height[i]); - } - if (p != nullptr && h > hint->num_levels) { - hint->prev[hint->num_levels] = p; - hint->prev_height[hint->num_levels] = h; - hint->num_levels++; +void InlineSkipList::InsertWithHint(const char* key, void** hint) { + assert(hint != nullptr); + Splice* splice = reinterpret_cast(*hint); + if (splice == nullptr) { + splice = AllocateSplice(); + *hint = reinterpret_cast(splice); } + Insert(key, splice, true); } template -void InlineSkipList::FindLevelSplice(const char* key, Node* before, - Node* after, int level, - Node** out_prev, - Node** out_next) { +void InlineSkipList::FindSpliceForLevel(const char* key, + Node* before, Node* after, + int level, Node** out_prev, + Node** out_next) { while (true) { Node* next = before->Next(level); assert(before == head_ || next == nullptr || @@ -831,15 +661,28 @@ void InlineSkipList::FindLevelSplice(const char* key, Node* before, } template -void InlineSkipList::InsertConcurrently(const char* key) { +void InlineSkipList::RecomputeSpliceLevels(const char* key, + Splice* splice, + int recompute_level) { + assert(recompute_level > 0); + assert(recompute_level <= splice->height_); + for (int i = recompute_level - 1; i >= 0; --i) { + FindSpliceForLevel(key, splice->prev_[i + 1], splice->next_[i + 1], i, + &splice->prev_[i], &splice->next_[i]); + } +} + +template +template +void InlineSkipList::Insert(const char* key, Splice* splice, + bool allow_partial_splice_fix) { Node* x = reinterpret_cast(const_cast(key)) - 1; int height = x->UnstashHeight(); assert(height >= 1 && height <= kMaxHeight_); - MaybeInvalidatePrev(height); int max_height = max_height_.load(std::memory_order_relaxed); while (height > max_height) { - if (max_height_.compare_exchange_strong(max_height, height)) { + if 
(max_height_.compare_exchange_weak(max_height, height)) { // successfully updated it max_height = height; break; @@ -849,28 +692,159 @@ void InlineSkipList::InsertConcurrently(const char* key) { } assert(max_height <= kMaxPossibleHeight); - Node* prev[kMaxPossibleHeight + 1]; - Node* next[kMaxPossibleHeight + 1]; - prev[max_height] = head_; - next[max_height] = nullptr; - for (int i = max_height - 1; i >= 0; --i) { - FindLevelSplice(key, prev[i + 1], next[i + 1], i, &prev[i], &next[i]); - } - for (int i = 0; i < height; ++i) { - while (true) { - x->NoBarrier_SetNext(i, next[i]); - if (prev[i]->CASNext(i, next[i], x)) { - // success + int recompute_height = 0; + if (splice->height_ < max_height) { + // Either splice has never been used or max_height has grown since + // last use. We could potentially fix it in the latter case, but + // that is tricky. + splice->prev_[max_height] = head_; + splice->next_[max_height] = nullptr; + splice->height_ = max_height; + recompute_height = max_height; + } else { + // Splice is a valid proper-height splice that brackets some + // key, but does it bracket this one? We need to validate it and + // recompute a portion of the splice (levels 0..recompute_height-1) + // that is a superset of all levels that don't bracket the new key. + // Several choices are reasonable, because we have to balance the work + // saved against the extra comparisons required to validate the Splice. + // + // One strategy is just to recompute all of orig_splice_height if the + // bottom level isn't bracketing. This pessimistically assumes that + // we will either get a perfect Splice hit (increasing sequential + // inserts) or have no locality. + // + // Another strategy is to walk up the Splice's levels until we find + // a level that brackets the key. 
This strategy lets the Splice + // hint help for other cases: it turns insertion from O(log N) into + // O(log D), where D is the number of nodes in between the key that + // produced the Splice and the current insert (insertion is aided + // whether the new key is before or after the splice). If you have + // a way of using a prefix of the key to map directly to the closest + // Splice out of O(sqrt(N)) Splices and we make it so that splices + // can also be used as hints during read, then we end up with Oshman's + // and Shavit's SkipTrie, which has O(log log N) lookup and insertion + // (compare to O(log N) for skip list). + // + // We control the pessimistic strategy with allow_partial_splice_fix. + // A good strategy is probably to be pessimistic for seq_splice_, + // optimistic if the caller actually went to the work of providing + // a Splice. + while (recompute_height < max_height) { + if (splice->prev_[recompute_height]->Next(recompute_height) != + splice->next_[recompute_height]) { + // splice isn't tight at this level, there must have been some inserts + // to this + // location that didn't update the splice. We might only be a little + // stale, but if + // the splice is very stale it would be O(N) to fix it. We haven't used + // up any of + // our budget of comparisons, so always move up even if we are + // pessimistic about + // our chances of success. 
+ ++recompute_height; + } else if (splice->prev_[recompute_height] != head_ && + !KeyIsAfterNode(key, splice->prev_[recompute_height])) { + // key is from before splice + if (allow_partial_splice_fix) { + // skip all levels with the same node without more comparisons + Node* bad = splice->prev_[recompute_height]; + while (splice->prev_[recompute_height] == bad) { + ++recompute_height; + } + } else { + // we're pessimistic, recompute everything + recompute_height = max_height; + } + } else if (KeyIsAfterNode(key, splice->next_[recompute_height])) { + // key is from after splice + if (allow_partial_splice_fix) { + Node* bad = splice->next_[recompute_height]; + while (splice->next_[recompute_height] == bad) { + ++recompute_height; + } + } else { + recompute_height = max_height; + } + } else { + // this level brackets the key, we won! break; } - // CAS failed, we need to recompute prev and next. It is unlikely - // to be helpful to try to use a different level as we redo the - // search, because it should be unlikely that lots of nodes have - // been inserted between prev[i] and next[i]. No point in using - // next[i] as the after hint, because we know it is stale. - FindLevelSplice(key, prev[i], nullptr, i, &prev[i], &next[i]); } } + assert(recompute_height <= max_height); + if (recompute_height > 0) { + RecomputeSpliceLevels(key, splice, recompute_height); + } + + bool splice_is_valid = true; + if (UseCAS) { + for (int i = 0; i < height; ++i) { + while (true) { + assert(splice->next_[i] == nullptr || + compare_(x->Key(), splice->next_[i]->Key()) < 0); + assert(splice->prev_[i] == head_ || + compare_(splice->prev_[i]->Key(), x->Key()) < 0); + x->NoBarrier_SetNext(i, splice->next_[i]); + if (splice->prev_[i]->CASNext(i, splice->next_[i], x)) { + // success + break; + } + // CAS failed, we need to recompute prev and next. 
It is unlikely + // to be helpful to try to use a different level as we redo the + // search, because it should be unlikely that lots of nodes have + // been inserted between prev[i] and next[i]. No point in using + // next[i] as the after hint, because we know it is stale. + FindSpliceForLevel(key, splice->prev_[i], nullptr, i, &splice->prev_[i], + &splice->next_[i]); + + // Since we've narrowed the bracket for level i, we might have + // violated the Splice constraint between i and i-1. Make sure + // we recompute the whole thing next time. + if (i > 0) { + splice_is_valid = false; + } + } + } + } else { + for (int i = 0; i < height; ++i) { + if (i >= recompute_height && + splice->prev_[i]->Next(i) != splice->next_[i]) { + FindSpliceForLevel(key, splice->prev_[i], nullptr, i, &splice->prev_[i], + &splice->next_[i]); + } + assert(splice->next_[i] == nullptr || + compare_(x->Key(), splice->next_[i]->Key()) < 0); + assert(splice->prev_[i] == head_ || + compare_(splice->prev_[i]->Key(), x->Key()) < 0); + assert(splice->prev_[i]->Next(i) == splice->next_[i]); + x->NoBarrier_SetNext(i, splice->next_[i]); + splice->prev_[i]->SetNext(i, x); + } + } + if (splice_is_valid) { + for (int i = 0; i < height; ++i) { + splice->prev_[i] = x; + } + assert(splice->prev_[splice->height_] == head_); + assert(splice->next_[splice->height_] == nullptr); + for (int i = 0; i < splice->height_; ++i) { + assert(splice->next_[i] == nullptr || + compare_(key, splice->next_[i]->Key()) < 0); + assert(splice->prev_[i] == head_ || + compare_(splice->prev_[i]->Key(), key) <= 0); + assert(splice->prev_[i + 1] == splice->prev_[i] || + splice->prev_[i + 1] == head_ || + compare_(splice->prev_[i + 1]->Key(), splice->prev_[i]->Key()) < + 0); + assert(splice->next_[i + 1] == splice->next_[i] || + splice->next_[i + 1] == nullptr || + compare_(splice->next_[i]->Key(), splice->next_[i + 1]->Key()) < + 0); + } + } else { + splice->height_ = 0; + } } template diff --git a/db/inlineskiplist_test.cc 
b/db/inlineskiplist_test.cc index 658ca4364..a06c49898 100644 --- a/db/inlineskiplist_test.cc +++ b/db/inlineskiplist_test.cc @@ -54,8 +54,7 @@ class InlineSkipTest : public testing::Test { keys_.insert(key); } - void InsertWithHint(TestInlineSkipList* list, Key key, - TestInlineSkipList::InsertHint** hint) { + void InsertWithHint(TestInlineSkipList* list, Key key, void** hint) { char* buf = list->AllocateKey(sizeof(Key)); memcpy(buf, &key, sizeof(Key)); list->InsertWithHint(buf, hint); @@ -201,7 +200,7 @@ TEST_F(InlineSkipTest, InsertWithHint_Sequential) { Arena arena; TestComparator cmp; TestInlineSkipList list(cmp, &arena); - TestInlineSkipList::InsertHint* hint = nullptr; + void* hint = nullptr; for (int i = 0; i < N; i++) { Key key = i; InsertWithHint(&list, key, &hint); @@ -216,7 +215,7 @@ TEST_F(InlineSkipTest, InsertWithHint_MultipleHints) { Arena arena; TestComparator cmp; TestInlineSkipList list(cmp, &arena); - TestInlineSkipList::InsertHint* hints[S]; + void* hints[S]; Key last_key[S]; for (int i = 0; i < S; i++) { hints[i] = nullptr; @@ -237,7 +236,7 @@ TEST_F(InlineSkipTest, InsertWithHint_MultipleHintsRandom) { Arena arena; TestComparator cmp; TestInlineSkipList list(cmp, &arena); - TestInlineSkipList::InsertHint* hints[S]; + void* hints[S]; for (int i = 0; i < S; i++) { hints[i] = nullptr; } @@ -260,7 +259,7 @@ TEST_F(InlineSkipTest, InsertWithHint_CompatibleWithInsertWithoutHint) { std::unordered_set used; Key with_hint[S1]; Key without_hint[S2]; - TestInlineSkipList::InsertHint* hints[S1]; + void* hints[S1]; for (int i = 0; i < S1; i++) { hints[i] = nullptr; while (true) { diff --git a/db/memtable.cc b/db/memtable.cc index 6de1bfff9..ef6356d3b 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -441,16 +441,12 @@ void MemTable::Add(SequenceNumber s, ValueType type, assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len); if (!allow_concurrent) { // Extract prefix for insert with hint. 
- Slice prefix; - if (insert_with_hint_prefix_extractor_ != nullptr) { - if (insert_with_hint_prefix_extractor_->InDomain(key_slice)) { - prefix = insert_with_hint_prefix_extractor_->Transform(key_slice); - } - } - if (prefix.empty()) { - table->Insert(handle); - } else { + if (insert_with_hint_prefix_extractor_ != nullptr && + insert_with_hint_prefix_extractor_->InDomain(key_slice)) { + Slice prefix = insert_with_hint_prefix_extractor_->Transform(key_slice); table->InsertWithHint(handle, &insert_hints_[prefix]); + } else { + table->Insert(handle); } // this is a bit ugly, but is the way to avoid locked instructions diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index 31280bbf2..3ae89a980 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -83,9 +83,12 @@ class MemTableRep { // collection, and no concurrent modifications to the table in progress virtual void Insert(KeyHandle handle) = 0; - // Same as Insert(), but in additional pass a hint to optimize sequential - // inserts. A new hint will be return from the hint pointer. Caller can get - // an initial hint by passing hint pointing to nullptr. + // Same as Insert(), but in additional pass a hint to insert location for + // the key. If hint points to nullptr, a new hint will be populated. + // Otherwise the hint will be updated to reflect the last insert location. + // + // Currently only skip-list based memtable implements the interface. Other + // implementations will fall back to Insert() by default. virtual void InsertWithHint(KeyHandle handle, void** hint) { // Ignore the hint by default. 
Insert(handle); diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 8906c5fbd..7e6657793 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -747,24 +747,22 @@ struct ColumnFamilyOptions { size_t memtable_huge_page_size; // If non-nullptr, memtable will use the specified function to extract - // prefixes for keys, and for each non-empty prefix maintain a hint to - // reduce CPU usage for inserting keys with the prefix. Keys with empty - // prefix will be insert without using a hint. + // prefixes for keys, and for each prefix maintain a hint of insert location + // to reduce CPU usage for inserting keys with the prefix. Keys out of + // domain of the prefix extractor will be inserted without using hints. // // Currently only the default skiplist based memtable implements the feature. - // All other memtable implementation will ignore the option. It incurs ~150 + // All other memtable implementation will ignore the option. It incurs ~250 additional bytes of memory overhead to store a hint for each prefix. - // If allow_concurrent_memtable_write is true, the option will also be - // ignored. - // - // The option is best suited for sequential inserts, or inserts that's - // almost sequential. One scenario could be inserting keys of the form - // (prefix + timestamp), and keys of the same prefix always comes in - // with time order, or in some cases a key with a smaller timestamp comes - // in later due to network latency. - // - // REQUIRES: If custom comparator is provided, it has to make sure keys - // with the same prefix appear in consecutive range. + // Also concurrent writes (when allow_concurrent_memtable_write is true) will + // ignore the option. + // + // The option is best suited for workloads where keys are likely to insert + // to a location close to the last inserted key with the same prefix. 
+ // One example could be inserting keys of the form (prefix + timestamp), + // and keys of the same prefix always comes in with time order. Another + // example would be updating the same key over and over again, in which case + // the prefix can be the key itself. // // Default: nullptr (disable) std::shared_ptr diff --git a/memtable/skiplistrep.cc b/memtable/skiplistrep.cc index 59ca2af7c..d07fa1218 100644 --- a/memtable/skiplistrep.cc +++ b/memtable/skiplistrep.cc @@ -37,9 +37,7 @@ public: } virtual void InsertWithHint(KeyHandle handle, void** hint) override { - skip_list_.InsertWithHint( - static_cast(handle), - reinterpret_cast(hint)); + skip_list_.InsertWithHint(static_cast(handle), hint); } virtual void InsertConcurrently(KeyHandle handle) override {