From 3f7e929865d8c1c923915359c4ee2efb7987869c Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Mon, 2 Aug 2021 18:10:57 -0700 Subject: [PATCH] Fix a race in ColumnFamilyData::UnrefAndTryDelete (#8605) Summary: The `ColumnFamilyData::UnrefAndTryDelete` code currently on the trunk unlocks the DB mutex before destroying the `ThreadLocalPtr` holding the per-thread `SuperVersion` pointers when the only remaining reference is the back reference from `super_version_`. The idea behind this was to break the circular dependency between `ColumnFamilyData` and `SuperVersion`: when the penultimate reference goes away, `ColumnFamilyData` can clean up the `SuperVersion`, which can in turn clean up `ColumnFamilyData`. (Assuming there is a `SuperVersion` and it is not referenced by anything else.) However, unlocking the mutex throws a wrench in this plan by making it possible for another thread to jump in and take another reference to the `ColumnFamilyData`, keeping the object alive in a zombie `ThreadLocalPtr`-less state. This can cause issues like https://github.com/facebook/rocksdb/issues/8440 , https://github.com/facebook/rocksdb/issues/8382 , and might also explain the `was_last_ref` assertion failures from the `ColumnFamilySet` destructor we sometimes observe during close in our stress tests. Digging through the archives, this unlocking goes way back to 2014 (or earlier). The original rationale was that `SuperVersionUnrefHandle` used to lock the mutex so it can call `SuperVersion::Cleanup`; however, this logic turned out to be deadlock-prone. https://github.com/facebook/rocksdb/pull/3510 fixed the deadlock but left the unlocking in place. https://github.com/facebook/rocksdb/pull/6147 then introduced the circular dependency and associated cleanup logic described above (in order to enable iterators to keep the `ColumnFamilyData` for dropped column families alive), and moved the unlocking-relocking snippet to its present location in `UnrefAndTryDelete`. Finally, https://github.com/facebook/rocksdb/pull/7749 fixed a memory leak but apparently exacerbated the race by (otherwise correctly) switching to `UnrefAndTryDelete` in `SuperVersion::Cleanup`. The patch simply eliminates the unlocking and relocking, which has been unnecessary ever since https://github.com/facebook/rocksdb/issues/3510 made `SuperVersionUnrefHandle` lock-free. This closes the window during which another thread could increase the reference count, and hopefully fixes the issues above. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8605 Test Plan: Ran `make check` and stress tests locally. Reviewed By: pdillinger Differential Revision: D30051035 Pulled By: ltamasi fbshipit-source-id: 8fe559e4b4ad69fc142579f8bc393ef525918528 --- db/column_family.cc | 32 +++++++++++--------------- db/column_family.h | 6 +---- db/db_impl/db_impl_compaction_flush.cc | 2 +- 3 files changed, 15 insertions(+), 25 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index 7a8fdd97d..c938b2e0c 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -457,7 +457,7 @@ void SuperVersion::Cleanup() { to_delete.push_back(m); } current->Unref(); - cfd->UnrefAndTryDelete(this); + cfd->UnrefAndTryDelete(); } void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem, @@ -475,10 +475,10 @@ void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem, namespace { void SuperVersionUnrefHandle(void* ptr) { - // UnrefHandle is called when a thread exists or a ThreadLocalPtr gets - // destroyed. When former happens, the thread shouldn't see kSVInUse. - // When latter happens, we are in ~ColumnFamilyData(), no get should happen as - // well. + // UnrefHandle is called when a thread exits or a ThreadLocalPtr gets + // destroyed. When the former happens, the thread shouldn't see kSVInUse. + // When the latter happens, only super_version_ holds a reference + // to ColumnFamilyData, so no further queries are possible. SuperVersion* sv = static_cast(ptr); bool was_last_ref __attribute__((__unused__)); was_last_ref = sv->Unref(); @@ -668,7 +668,7 @@ ColumnFamilyData::~ColumnFamilyData() { } } -bool ColumnFamilyData::UnrefAndTryDelete(SuperVersion* sv_under_cleanup) { +bool ColumnFamilyData::UnrefAndTryDelete() { int old_refs = refs_.fetch_sub(1); assert(old_refs > 0); @@ -678,22 +678,17 @@ bool ColumnFamilyData::UnrefAndTryDelete(SuperVersion* sv_under_cleanup) { return true; } - // If called under SuperVersion::Cleanup, we should not re-enter Cleanup on - // the same SuperVersion. (But while installing a new SuperVersion, this - // cfd could be referenced only by two SuperVersions.) - if (old_refs == 2 && super_version_ != nullptr && - super_version_ != sv_under_cleanup) { + if (old_refs == 2 && super_version_ != nullptr) { // Only the super_version_ holds me SuperVersion* sv = super_version_; super_version_ = nullptr; - // Release SuperVersion reference kept in ThreadLocalPtr. - // This must be done outside of mutex_ since unref handler can lock mutex. - sv->db_mutex->Unlock(); + + // Release SuperVersion references kept in ThreadLocalPtr. local_sv_.reset(); - sv->db_mutex->Lock(); if (sv->Unref()) { - // May delete this ColumnFamilyData after calling Cleanup() + // Note: sv will delete this ColumnFamilyData during Cleanup() + assert(sv->cfd == this); sv->Cleanup(); delete sv; return true; @@ -1258,14 +1253,13 @@ bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) { void ColumnFamilyData::InstallSuperVersion( SuperVersionContext* sv_context, InstrumentedMutex* db_mutex) { db_mutex->AssertHeld(); - return InstallSuperVersion(sv_context, db_mutex, mutable_cf_options_); + return InstallSuperVersion(sv_context, mutable_cf_options_); } void ColumnFamilyData::InstallSuperVersion( - SuperVersionContext* sv_context, InstrumentedMutex* db_mutex, + SuperVersionContext* sv_context, const MutableCFOptions& mutable_cf_options) { SuperVersion* new_superversion = sv_context->new_superversion.release(); - new_superversion->db_mutex = db_mutex; new_superversion->mutable_cf_options = mutable_cf_options; new_superversion->Init(this, mem_, imm_.current(), current_); SuperVersion* old_superversion = super_version_; diff --git a/db/column_family.h b/db/column_family.h index a2e7d026a..99106c612 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -208,8 +208,6 @@ struct SuperVersion { uint64_t version_number; WriteStallCondition write_stall_condition; - InstrumentedMutex* db_mutex; - // should be called outside the mutex SuperVersion() = default; ~SuperVersion(); @@ -281,8 +279,7 @@ class ColumnFamilyData { // UnrefAndTryDelete() decreases the reference count and do free if needed, // return true if this is freed else false, UnrefAndTryDelete() can only // be called while holding a DB mutex, or during single-threaded recovery. - // sv_under_cleanup is only provided when called from SuperVersion::Cleanup. - bool UnrefAndTryDelete(SuperVersion* sv_under_cleanup = nullptr); + bool UnrefAndTryDelete(); // SetDropped() can only be called under following conditions: // 1) Holding a DB mutex, @@ -453,7 +450,6 @@ class ColumnFamilyData { // the clients to allocate SuperVersion outside of mutex. // IMPORTANT: Only call this from DBImpl::InstallSuperVersion() void InstallSuperVersion(SuperVersionContext* sv_context, - InstrumentedMutex* db_mutex, const MutableCFOptions& mutable_cf_options); void InstallSuperVersion(SuperVersionContext* sv_context, InstrumentedMutex* db_mutex); diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 515bd7422..e4312bbcf 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -3469,7 +3469,7 @@ void DBImpl::InstallSuperVersionAndScheduleWork( if (UNLIKELY(sv_context->new_superversion == nullptr)) { sv_context->NewSuperVersion(); } - cfd->InstallSuperVersion(sv_context, &mutex_, mutable_cf_options); + cfd->InstallSuperVersion(sv_context, mutable_cf_options); // There may be a small data race here. The snapshot tricking bottommost // compaction may already be released here. But assuming there will always be