Fix a bug in hash linked list (#10401)

Summary:
In hash linked list, with a bucket of only one record, following sequence can cause users to temporarily miss a record:

Thread 1: Fetch the structure bucket x points too, which would be a Node n1 for a key, with next pointer to be null
Thread 2: Insert a key to bucket x that is larger than the existing key. This will make n1->next points to a new node n2, and update bucket x to point to n1.
Thread 1: see n1->next is not null, so it thinks it is a header of linked list and ignore the key of n1.

Fix it by refetch structure that bucket x points to when it sees n1->next is not null. This should work because if n1->next is not null, bucket x should already point to a linked list or skip list header.

A related change is to revert th order of testing for linked list and skip list. This is because after refetching the bucket, it might end up with a skip list, rather than linked list.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10401

Test Plan: Run existing tests and make sure at least it doesn't regress.

Reviewed By: jay-zhuang

Differential Revision: D38064471

fbshipit-source-id: 142bb85e1546c803f47e3357aef3e76debccd8df
main
sdong 2 years ago committed by Facebook GitHub Bot
parent 6a160e1fec
commit 4e00748098
  1. 1
      HISTORY.md
  2. 178
      memtable/hash_linklist_rep.cc

@ -13,6 +13,7 @@
* Fix a bug where `GenericRateLimiter` could revert the bandwidth set dynamically using `SetBytesPerSecond()` when a user configures a structure enclosing it, e.g., using `GetOptionsFromString()` to configure an `Options` that references an existing `RateLimiter` object.
* Fix race conditions in `GenericRateLimiter`.
* Fix a bug in `FIFOCompactionPicker::PickTTLCompaction` where total_size calculating might cause underflow
* Fix data race bug in hash linked list memtable. With this bug, read request might temporarily miss an old record in the memtable in a race condition to the hash bucket.
## 7.5.0 (07/15/2022)
### New Features

@ -209,10 +209,16 @@ class HashLinkListRep : public MemTableRep {
bool LinkListContains(Node* head, const Slice& key) const;
SkipListBucketHeader* GetSkipListBucketHeader(Pointer* first_next_pointer)
const;
bool IsEmptyBucket(Pointer& bucket_pointer) const {
return bucket_pointer.load(std::memory_order_acquire) == nullptr;
}
// Precondition: GetLinkListFirstNode() must have been called first and return
// null so that it must be a skip list bucket
SkipListBucketHeader* GetSkipListBucketHeader(Pointer& bucket_pointer) const;
Node* GetLinkListFirstNode(Pointer* first_next_pointer) const;
// Returning nullptr indicates it is a skip list bucket.
Node* GetLinkListFirstNode(Pointer& bucket_pointer) const;
Slice GetPrefix(const Slice& internal_key) const {
return transform_->Transform(ExtractUserKey(internal_key));
@ -222,11 +228,9 @@ class HashLinkListRep : public MemTableRep {
return GetSliceRangedNPHash(slice, bucket_size_);
}
Pointer* GetBucket(size_t i) const {
return static_cast<Pointer*>(buckets_[i].load(std::memory_order_acquire));
}
Pointer& GetBucket(size_t i) const { return buckets_[i]; }
Pointer* GetBucket(const Slice& slice) const {
Pointer& GetBucket(const Slice& slice) const {
return GetBucket(GetHash(slice));
}
@ -414,30 +418,39 @@ class HashLinkListRep : public MemTableRep {
// Advance to the first entry with a key >= target
void Seek(const Slice& k, const char* memtable_key) override {
auto transformed = memtable_rep_.GetPrefix(k);
auto* bucket = memtable_rep_.GetBucket(transformed);
SkipListBucketHeader* skip_list_header =
memtable_rep_.GetSkipListBucketHeader(bucket);
if (skip_list_header != nullptr) {
// The bucket is organized as a skip list
if (!skip_list_iter_) {
skip_list_iter_.reset(
new MemtableSkipList::Iterator(&skip_list_header->skip_list));
} else {
skip_list_iter_->SetList(&skip_list_header->skip_list);
}
if (memtable_key != nullptr) {
skip_list_iter_->Seek(memtable_key);
Pointer& bucket = memtable_rep_.GetBucket(transformed);
if (memtable_rep_.IsEmptyBucket(bucket)) {
skip_list_iter_.reset();
Reset(nullptr);
} else {
Node* first_linked_list_node =
memtable_rep_.GetLinkListFirstNode(bucket);
if (first_linked_list_node != nullptr) {
// The bucket is organized as a linked list
skip_list_iter_.reset();
Reset(first_linked_list_node);
HashLinkListRep::LinkListIterator::Seek(k, memtable_key);
} else {
IterKey encoded_key;
encoded_key.EncodeLengthPrefixedKey(k);
skip_list_iter_->Seek(encoded_key.GetUserKey().data());
SkipListBucketHeader* skip_list_header =
memtable_rep_.GetSkipListBucketHeader(bucket);
assert(skip_list_header != nullptr);
// The bucket is organized as a skip list
if (!skip_list_iter_) {
skip_list_iter_.reset(
new MemtableSkipList::Iterator(&skip_list_header->skip_list));
} else {
skip_list_iter_->SetList(&skip_list_header->skip_list);
}
if (memtable_key != nullptr) {
skip_list_iter_->Seek(memtable_key);
} else {
IterKey encoded_key;
encoded_key.EncodeLengthPrefixedKey(k);
skip_list_iter_->Seek(encoded_key.GetUserKey().data());
}
}
} else {
// The bucket is organized as a linked list
skip_list_iter_.reset();
Reset(memtable_rep_.GetLinkListFirstNode(bucket));
HashLinkListRep::LinkListIterator::Seek(k, memtable_key);
}
}
@ -528,36 +541,38 @@ KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) {
}
SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader(
Pointer* first_next_pointer) const {
if (first_next_pointer == nullptr) {
return nullptr;
}
if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
// Single entry bucket
return nullptr;
}
Pointer& bucket_pointer) const {
Pointer* first_next_pointer =
static_cast<Pointer*>(bucket_pointer.load(std::memory_order_acquire));
assert(first_next_pointer != nullptr);
assert(first_next_pointer->load(std::memory_order_relaxed) != nullptr);
// Counting header
BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
if (header->IsSkipListBucket()) {
assert(header->GetNumEntries() > threshold_use_skiplist_);
auto* skip_list_bucket_header =
reinterpret_cast<SkipListBucketHeader*>(header);
assert(skip_list_bucket_header->Counting_header.next.load(
std::memory_order_relaxed) == header);
return skip_list_bucket_header;
}
assert(header->GetNumEntries() <= threshold_use_skiplist_);
return nullptr;
assert(header->IsSkipListBucket());
assert(header->GetNumEntries() > threshold_use_skiplist_);
auto* skip_list_bucket_header =
reinterpret_cast<SkipListBucketHeader*>(header);
assert(skip_list_bucket_header->Counting_header.next.load(
std::memory_order_relaxed) == header);
return skip_list_bucket_header;
}
Node* HashLinkListRep::GetLinkListFirstNode(Pointer* first_next_pointer) const {
if (first_next_pointer == nullptr) {
return nullptr;
}
Node* HashLinkListRep::GetLinkListFirstNode(Pointer& bucket_pointer) const {
Pointer* first_next_pointer =
static_cast<Pointer*>(bucket_pointer.load(std::memory_order_acquire));
assert(first_next_pointer != nullptr);
if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
// Single entry bucket
return reinterpret_cast<Node*>(first_next_pointer);
}
// It is possible that after we fetch first_next_pointer it is modified
// and the next is not null anymore. In this case, the bucket should have been
// modified to a counting header, so we should reload the first_next_pointer
// to make sure we see the update.
first_next_pointer =
static_cast<Pointer*>(bucket_pointer.load(std::memory_order_acquire));
// Counting header
BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
if (!header->IsSkipListBucket()) {
@ -695,17 +710,21 @@ bool HashLinkListRep::Contains(const char* key) const {
Slice internal_key = GetLengthPrefixedSlice(key);
auto transformed = GetPrefix(internal_key);
auto bucket = GetBucket(transformed);
if (bucket == nullptr) {
Pointer& bucket = GetBucket(transformed);
if (IsEmptyBucket(bucket)) {
return false;
}
Node* linked_list_node = GetLinkListFirstNode(bucket);
if (linked_list_node != nullptr) {
return LinkListContains(linked_list_node, internal_key);
}
SkipListBucketHeader* skip_list_header = GetSkipListBucketHeader(bucket);
if (skip_list_header != nullptr) {
return skip_list_header->skip_list.Contains(key);
} else {
return LinkListContains(GetLinkListFirstNode(bucket), internal_key);
}
return false;
}
size_t HashLinkListRep::ApproximateMemoryUsage() {
@ -716,21 +735,25 @@ size_t HashLinkListRep::ApproximateMemoryUsage() {
void HashLinkListRep::Get(const LookupKey& k, void* callback_args,
bool (*callback_func)(void* arg, const char* entry)) {
auto transformed = transform_->Transform(k.user_key());
auto bucket = GetBucket(transformed);
Pointer& bucket = GetBucket(transformed);
auto* skip_list_header = GetSkipListBucketHeader(bucket);
if (skip_list_header != nullptr) {
// Is a skip list
MemtableSkipList::Iterator iter(&skip_list_header->skip_list);
for (iter.Seek(k.memtable_key().data());
if (IsEmptyBucket(bucket)) {
return;
}
auto* link_list_head = GetLinkListFirstNode(bucket);
if (link_list_head != nullptr) {
LinkListIterator iter(this, link_list_head);
for (iter.Seek(k.internal_key(), nullptr);
iter.Valid() && callback_func(callback_args, iter.key());
iter.Next()) {
}
} else {
auto* link_list_head = GetLinkListFirstNode(bucket);
if (link_list_head != nullptr) {
LinkListIterator iter(this, link_list_head);
for (iter.Seek(k.internal_key(), nullptr);
auto* skip_list_header = GetSkipListBucketHeader(bucket);
if (skip_list_header != nullptr) {
// Is a skip list
MemtableSkipList::Iterator iter(&skip_list_header->skip_list);
for (iter.Seek(k.memtable_key().data());
iter.Valid() && callback_func(callback_args, iter.key());
iter.Next()) {
}
@ -746,25 +769,24 @@ MemTableRep::Iterator* HashLinkListRep::GetIterator(Arena* alloc_arena) {
for (size_t i = 0; i < bucket_size_; ++i) {
int count = 0;
auto* bucket = GetBucket(i);
if (bucket != nullptr) {
auto* skip_list_header = GetSkipListBucketHeader(bucket);
if (skip_list_header != nullptr) {
Pointer& bucket = GetBucket(i);
if (!IsEmptyBucket(bucket)) {
auto* link_list_head = GetLinkListFirstNode(bucket);
if (link_list_head != nullptr) {
LinkListIterator itr(this, link_list_head);
for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
list->Insert(itr.key());
count++;
}
} else {
auto* skip_list_header = GetSkipListBucketHeader(bucket);
assert(skip_list_header != nullptr);
// Is a skip list
MemtableSkipList::Iterator itr(&skip_list_header->skip_list);
for (itr.SeekToFirst(); itr.Valid(); itr.Next()) {
list->Insert(itr.key());
count++;
}
} else {
auto* link_list_head = GetLinkListFirstNode(bucket);
if (link_list_head != nullptr) {
LinkListIterator itr(this, link_list_head);
for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
list->Insert(itr.key());
count++;
}
}
}
}
if (if_log_bucket_dist_when_flash_) {

Loading…
Cancel
Save