Support concurrent CF iteration and drop (#6147)

Summary:
It's easy to cause coredump when closing ColumnFamilyHandle with unreleased iterators, especially iterators release is controlled by java GC when using JNI.

This patch fixed concurrent CF iteration and drop, we let iterators(actually SuperVersion) hold a ColumnFamilyData reference to prevent the CF from being released too early.

fixed https://github.com/facebook/rocksdb/issues/5982
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6147

Differential Revision: D18926378

fbshipit-source-id: 1dff6d068c603d012b81446812368bfee95a5e15
main
Jermy Li 5 years ago committed by Facebook Github Bot
parent 4b74035e40
commit c2029f9716
  1. 68
      db/column_family.cc
  2. 10
      db/column_family.h
  3. 4
      db/compaction/compaction.cc
  4. 2
      db/db_filesnapshot.cc
  5. 15
      db/db_impl/db_impl.cc
  6. 26
      db/db_impl/db_impl_compaction_flush.cc
  7. 2
      db/db_impl/db_impl_open.cc
  8. 10
      db/db_impl/db_impl_write.cc
  9. 8
      db/flush_scheduler.cc
  10. 9
      db/trim_history_scheduler.cc
  11. 13
      db/version_set.cc

@ -60,11 +60,8 @@ ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
ColumnFamilyOptions initial_cf_options_copy = cfd_->initial_cf_options(); ColumnFamilyOptions initial_cf_options_copy = cfd_->initial_cf_options();
JobContext job_context(0); JobContext job_context(0);
mutex_->Lock(); mutex_->Lock();
if (cfd_->Unref()) {
bool dropped = cfd_->IsDropped(); bool dropped = cfd_->IsDropped();
if (cfd_->UnrefAndTryDelete()) {
delete cfd_;
if (dropped) { if (dropped) {
db_->FindObsoleteFiles(&job_context, false, true); db_->FindObsoleteFiles(&job_context, false, true);
} }
@ -439,13 +436,18 @@ void SuperVersion::Cleanup() {
to_delete.push_back(m); to_delete.push_back(m);
} }
current->Unref(); current->Unref();
if (cfd->Unref()) {
delete cfd;
}
} }
void SuperVersion::Init(MemTable* new_mem, MemTableListVersion* new_imm, void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
Version* new_current) { MemTableListVersion* new_imm, Version* new_current) {
cfd = new_cfd;
mem = new_mem; mem = new_mem;
imm = new_imm; imm = new_imm;
current = new_current; current = new_current;
cfd->Ref();
mem->Ref(); mem->Ref();
imm->Ref(); imm->Ref();
current->Ref(); current->Ref();
@ -581,21 +583,7 @@ ColumnFamilyData::~ColumnFamilyData() {
// compaction_queue_ and we destroyed it // compaction_queue_ and we destroyed it
assert(!queued_for_flush_); assert(!queued_for_flush_);
assert(!queued_for_compaction_); assert(!queued_for_compaction_);
assert(super_version_ == nullptr);
if (super_version_ != nullptr) {
// Release SuperVersion reference kept in ThreadLocalPtr.
// This must be done outside of mutex_ since unref handler can lock mutex.
super_version_->db_mutex->Unlock();
local_sv_.reset();
super_version_->db_mutex->Lock();
bool is_last_reference __attribute__((__unused__));
is_last_reference = super_version_->Unref();
assert(is_last_reference);
super_version_->Cleanup();
delete super_version_;
super_version_ = nullptr;
}
if (dummy_versions_ != nullptr) { if (dummy_versions_ != nullptr) {
// List must be empty // List must be empty
@ -615,6 +603,36 @@ ColumnFamilyData::~ColumnFamilyData() {
} }
} }
bool ColumnFamilyData::UnrefAndTryDelete() {
int old_refs = refs_.fetch_sub(1);
assert(old_refs > 0);
if (old_refs == 1) {
assert(super_version_ == nullptr);
delete this;
return true;
}
if (old_refs == 2 && super_version_ != nullptr) {
// Only the super_version_ holds me
SuperVersion* sv = super_version_;
super_version_ = nullptr;
// Release SuperVersion reference kept in ThreadLocalPtr.
// This must be done outside of mutex_ since unref handler can lock mutex.
sv->db_mutex->Unlock();
local_sv_.reset();
sv->db_mutex->Lock();
if (sv->Unref()) {
// May delete this ColumnFamilyData after calling Cleanup()
sv->Cleanup();
delete sv;
return true;
}
}
return false;
}
void ColumnFamilyData::SetDropped() { void ColumnFamilyData::SetDropped() {
// can't drop default CF // can't drop default CF
assert(id_ != 0); assert(id_ != 0);
@ -1169,7 +1187,7 @@ void ColumnFamilyData::InstallSuperVersion(
SuperVersion* new_superversion = sv_context->new_superversion.release(); SuperVersion* new_superversion = sv_context->new_superversion.release();
new_superversion->db_mutex = db_mutex; new_superversion->db_mutex = db_mutex;
new_superversion->mutable_cf_options = mutable_cf_options; new_superversion->mutable_cf_options = mutable_cf_options;
new_superversion->Init(mem_, imm_.current(), current_); new_superversion->Init(this, mem_, imm_.current(), current_);
SuperVersion* old_superversion = super_version_; SuperVersion* old_superversion = super_version_;
super_version_ = new_superversion; super_version_ = new_superversion;
++super_version_number_; ++super_version_number_;
@ -1344,14 +1362,12 @@ ColumnFamilySet::~ColumnFamilySet() {
// cfd destructor will delete itself from column_family_data_ // cfd destructor will delete itself from column_family_data_
auto cfd = column_family_data_.begin()->second; auto cfd = column_family_data_.begin()->second;
bool last_ref __attribute__((__unused__)); bool last_ref __attribute__((__unused__));
last_ref = cfd->Unref(); last_ref = cfd->UnrefAndTryDelete();
assert(last_ref); assert(last_ref);
delete cfd;
} }
bool dummy_last_ref __attribute__((__unused__)); bool dummy_last_ref __attribute__((__unused__));
dummy_last_ref = dummy_cfd_->Unref(); dummy_last_ref = dummy_cfd_->UnrefAndTryDelete();
assert(dummy_last_ref); assert(dummy_last_ref);
delete dummy_cfd_;
} }
ColumnFamilyData* ColumnFamilySet::GetDefault() const { ColumnFamilyData* ColumnFamilySet::GetDefault() const {

@ -198,6 +198,7 @@ class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
struct SuperVersion { struct SuperVersion {
// Accessing members of this class is not thread-safe and requires external // Accessing members of this class is not thread-safe and requires external
// synchronization (ie db mutex held or on write thread). // synchronization (ie db mutex held or on write thread).
ColumnFamilyData* cfd;
MemTable* mem; MemTable* mem;
MemTableListVersion* imm; MemTableListVersion* imm;
Version* current; Version* current;
@ -221,8 +222,8 @@ struct SuperVersion {
// that needs to be deleted in to_delete vector. Unrefing those // that needs to be deleted in to_delete vector. Unrefing those
// objects needs to be done in the mutex // objects needs to be done in the mutex
void Cleanup(); void Cleanup();
void Init(MemTable* new_mem, MemTableListVersion* new_imm, void Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
Version* new_current); MemTableListVersion* new_imm, Version* new_current);
// The value of dummy is not actually used. kSVInUse takes its address as a // The value of dummy is not actually used. kSVInUse takes its address as a
// mark in the thread local storage to indicate the SuperVersion is in use // mark in the thread local storage to indicate the SuperVersion is in use
@ -288,6 +289,11 @@ class ColumnFamilyData {
return old_refs == 1; return old_refs == 1;
} }
// UnrefAndTryDelete() decreases the reference count and do free if needed,
// return true if this is freed else false, UnrefAndTryDelete() can only
// be called while holding a DB mutex, or during single-threaded recovery.
bool UnrefAndTryDelete();
// SetDropped() can only be called under following conditions: // SetDropped() can only be called under following conditions:
// 1) Holding a DB mutex, // 1) Holding a DB mutex,
// 2) from single-threaded write thread, AND // 2) from single-threaded write thread, AND

@ -275,9 +275,7 @@ Compaction::~Compaction() {
input_version_->Unref(); input_version_->Unref();
} }
if (cfd_ != nullptr) { if (cfd_ != nullptr) {
if (cfd_->Unref()) { cfd_->UnrefAndTryDelete();
delete cfd_;
}
} }
} }

@ -102,7 +102,7 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
TEST_SYNC_POINT("DBImpl::GetLiveFiles:1"); TEST_SYNC_POINT("DBImpl::GetLiveFiles:1");
TEST_SYNC_POINT("DBImpl::GetLiveFiles:2"); TEST_SYNC_POINT("DBImpl::GetLiveFiles:2");
mutex_.Lock(); mutex_.Lock();
cfd->Unref(); cfd->UnrefAndTryDelete();
if (!status.ok()) { if (!status.ok()) {
break; break;
} }

@ -330,7 +330,7 @@ Status DBImpl::ResumeImpl() {
mutex_.Unlock(); mutex_.Unlock();
s = FlushMemTable(cfd, flush_opts, FlushReason::kErrorRecovery); s = FlushMemTable(cfd, flush_opts, FlushReason::kErrorRecovery);
mutex_.Lock(); mutex_.Lock();
cfd->Unref(); cfd->UnrefAndTryDelete();
if (!s.ok()) { if (!s.ok()) {
break; break;
} }
@ -418,7 +418,7 @@ void DBImpl::CancelAllBackgroundWork(bool wait) {
mutex_.Unlock(); mutex_.Unlock();
FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown); FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown);
mutex_.Lock(); mutex_.Lock();
cfd->Unref(); cfd->UnrefAndTryDelete();
} }
} }
} }
@ -475,17 +475,12 @@ Status DBImpl::CloseHelper() {
while (!flush_queue_.empty()) { while (!flush_queue_.empty()) {
const FlushRequest& flush_req = PopFirstFromFlushQueue(); const FlushRequest& flush_req = PopFirstFromFlushQueue();
for (const auto& iter : flush_req) { for (const auto& iter : flush_req) {
ColumnFamilyData* cfd = iter.first; iter.first->UnrefAndTryDelete();
if (cfd->Unref()) {
delete cfd;
}
} }
} }
while (!compaction_queue_.empty()) { while (!compaction_queue_.empty()) {
auto cfd = PopFirstFromCompactionQueue(); auto cfd = PopFirstFromCompactionQueue();
if (cfd->Unref()) { cfd->UnrefAndTryDelete();
delete cfd;
}
} }
if (default_cf_handle_ != nullptr || persist_stats_cf_handle_ != nullptr) { if (default_cf_handle_ != nullptr || persist_stats_cf_handle_ != nullptr) {
@ -4303,7 +4298,7 @@ Status DBImpl::VerifyChecksum(const ReadOptions& read_options) {
} }
} }
for (auto cfd : cfd_list) { for (auto cfd : cfd_list) {
cfd->Unref(); cfd->UnrefAndTryDelete();
} }
} }
return s; return s;

@ -1618,12 +1618,9 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
} }
s = WaitForFlushMemTables(cfds, flush_memtable_ids, s = WaitForFlushMemTables(cfds, flush_memtable_ids,
(flush_reason == FlushReason::kErrorRecovery)); (flush_reason == FlushReason::kErrorRecovery));
for (auto* tmp_cfd : cfds) {
if (tmp_cfd->Unref()) {
// Only one thread can reach here.
InstrumentedMutexLock lock_guard(&mutex_); InstrumentedMutexLock lock_guard(&mutex_);
delete tmp_cfd; for (auto* tmp_cfd : cfds) {
} tmp_cfd->UnrefAndTryDelete();
} }
} }
TEST_SYNC_POINT("DBImpl::FlushMemTable:FlushMemTableFinished"); TEST_SYNC_POINT("DBImpl::FlushMemTable:FlushMemTableFinished");
@ -1683,7 +1680,7 @@ Status DBImpl::AtomicFlushMemTables(
} }
cfd->Ref(); cfd->Ref();
s = SwitchMemtable(cfd, &context); s = SwitchMemtable(cfd, &context);
cfd->Unref(); cfd->UnrefAndTryDelete();
if (!s.ok()) { if (!s.ok()) {
break; break;
} }
@ -1723,12 +1720,9 @@ Status DBImpl::AtomicFlushMemTables(
} }
s = WaitForFlushMemTables(cfds, flush_memtable_ids, s = WaitForFlushMemTables(cfds, flush_memtable_ids,
(flush_reason == FlushReason::kErrorRecovery)); (flush_reason == FlushReason::kErrorRecovery));
for (auto* cfd : cfds) {
if (cfd->Unref()) {
// Only one thread can reach here.
InstrumentedMutexLock lock_guard(&mutex_); InstrumentedMutexLock lock_guard(&mutex_);
delete cfd; for (auto* cfd : cfds) {
} cfd->UnrefAndTryDelete();
} }
} }
return s; return s;
@ -2209,16 +2203,13 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
*reason = bg_flush_args[0].cfd_->GetFlushReason(); *reason = bg_flush_args[0].cfd_->GetFlushReason();
for (auto& arg : bg_flush_args) { for (auto& arg : bg_flush_args) {
ColumnFamilyData* cfd = arg.cfd_; ColumnFamilyData* cfd = arg.cfd_;
if (cfd->Unref()) { if (cfd->UnrefAndTryDelete()) {
delete cfd;
arg.cfd_ = nullptr; arg.cfd_ = nullptr;
} }
} }
} }
for (auto cfd : column_families_not_to_flush) { for (auto cfd : column_families_not_to_flush) {
if (cfd->Unref()) { cfd->UnrefAndTryDelete();
delete cfd;
}
} }
return status; return status;
} }
@ -2547,10 +2538,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
// reference). // reference).
// This will all happen under a mutex so we don't have to be afraid of // This will all happen under a mutex so we don't have to be afraid of
// somebody else deleting it. // somebody else deleting it.
if (cfd->Unref()) { if (cfd->UnrefAndTryDelete()) {
// This was the last reference of the column family, so no need to // This was the last reference of the column family, so no need to
// compact. // compact.
delete cfd;
return Status::OK(); return Status::OK();
} }

@ -920,7 +920,7 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
ColumnFamilyData* cfd; ColumnFamilyData* cfd;
while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) { while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
cfd->Unref(); cfd->UnrefAndTryDelete();
// If this asserts, it means that InsertInto failed in // If this asserts, it means that InsertInto failed in
// filtering updates to already-flushed column families // filtering updates to already-flushed column families
assert(cfd->GetLogNumber() <= log_number); assert(cfd->GetLogNumber() <= log_number);

@ -1256,7 +1256,7 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) {
for (const auto cfd : cfds) { for (const auto cfd : cfds) {
cfd->Ref(); cfd->Ref();
status = SwitchMemtable(cfd, write_context); status = SwitchMemtable(cfd, write_context);
cfd->Unref(); cfd->UnrefAndTryDelete();
if (!status.ok()) { if (!status.ok()) {
break; break;
} }
@ -1335,7 +1335,7 @@ Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) {
} }
cfd->Ref(); cfd->Ref();
status = SwitchMemtable(cfd, write_context); status = SwitchMemtable(cfd, write_context);
cfd->Unref(); cfd->UnrefAndTryDelete();
if (!status.ok()) { if (!status.ok()) {
break; break;
} }
@ -1525,8 +1525,7 @@ Status DBImpl::TrimMemtableHistory(WriteContext* context) {
assert(context->superversion_context.new_superversion.get() != nullptr); assert(context->superversion_context.new_superversion.get() != nullptr);
cfd->InstallSuperVersion(&context->superversion_context, &mutex_); cfd->InstallSuperVersion(&context->superversion_context, &mutex_);
if (cfd->Unref()) { if (cfd->UnrefAndTryDelete()) {
delete cfd;
cfd = nullptr; cfd = nullptr;
} }
} }
@ -1558,8 +1557,7 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) {
if (!cfd->mem()->IsEmpty()) { if (!cfd->mem()->IsEmpty()) {
status = SwitchMemtable(cfd, context); status = SwitchMemtable(cfd, context);
} }
if (cfd->Unref()) { if (cfd->UnrefAndTryDelete()) {
delete cfd;
cfd = nullptr; cfd = nullptr;
} }
if (!status.ok()) { if (!status.ok()) {

@ -60,9 +60,7 @@ ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() {
} }
// no longer relevant, retry // no longer relevant, retry
if (cfd->Unref()) { cfd->UnrefAndTryDelete();
delete cfd;
}
} }
} }
@ -80,9 +78,7 @@ bool FlushScheduler::Empty() {
void FlushScheduler::Clear() { void FlushScheduler::Clear() {
ColumnFamilyData* cfd; ColumnFamilyData* cfd;
while ((cfd = TakeNextColumnFamily()) != nullptr) { while ((cfd = TakeNextColumnFamily()) != nullptr) {
if (cfd->Unref()) { cfd->UnrefAndTryDelete();
delete cfd;
}
} }
assert(head_.load(std::memory_order_relaxed) == nullptr); assert(head_.load(std::memory_order_relaxed) == nullptr);
} }

@ -34,10 +34,7 @@ ColumnFamilyData* TrimHistoryScheduler::TakeNextColumnFamily() {
// success // success
return cfd; return cfd;
} }
if (cfd->Unref()) { cfd->UnrefAndTryDelete();
// no longer relevant, retry
delete cfd;
}
} }
} }
@ -49,9 +46,7 @@ bool TrimHistoryScheduler::Empty() {
void TrimHistoryScheduler::Clear() { void TrimHistoryScheduler::Clear() {
ColumnFamilyData* cfd; ColumnFamilyData* cfd;
while ((cfd = TakeNextColumnFamily()) != nullptr) { while ((cfd = TakeNextColumnFamily()) != nullptr) {
if (cfd->Unref()) { cfd->UnrefAndTryDelete();
delete cfd;
}
} }
assert(Empty()); assert(Empty());
} }

@ -3888,9 +3888,7 @@ Status VersionSet::ProcessManifestWrites(
} else if (first_writer.edit_list.front()->is_column_family_drop_) { } else if (first_writer.edit_list.front()->is_column_family_drop_) {
assert(batch_edits.size() == 1); assert(batch_edits.size() == 1);
first_writer.cfd->SetDropped(); first_writer.cfd->SetDropped();
if (first_writer.cfd->Unref()) { first_writer.cfd->UnrefAndTryDelete();
delete first_writer.cfd;
}
} else { } else {
// Each version in versions corresponds to a column family. // Each version in versions corresponds to a column family.
// For each column family, update its log number indicating that logs // For each column family, update its log number indicating that logs
@ -4168,8 +4166,7 @@ Status VersionSet::ApplyOneVersionEditToBuilder(
builders.erase(builder); builders.erase(builder);
cfd = column_family_set_->GetColumnFamily(edit.column_family_); cfd = column_family_set_->GetColumnFamily(edit.column_family_);
assert(cfd != nullptr); assert(cfd != nullptr);
if (cfd->Unref()) { if (cfd->UnrefAndTryDelete()) {
delete cfd;
cfd = nullptr; cfd = nullptr;
} else { } else {
// who else can have reference to cfd!? // who else can have reference to cfd!?
@ -4781,8 +4778,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
comparators.erase(edit.column_family_); comparators.erase(edit.column_family_);
cfd = column_family_set_->GetColumnFamily(edit.column_family_); cfd = column_family_set_->GetColumnFamily(edit.column_family_);
assert(cfd != nullptr); assert(cfd != nullptr);
cfd->Unref(); cfd->UnrefAndTryDelete();
delete cfd;
cfd = nullptr; cfd = nullptr;
} else { } else {
if (!cf_in_builders) { if (!cf_in_builders) {
@ -5808,8 +5804,7 @@ Status ReactiveVersionSet::ApplyOneVersionEditToBuilder(
// secondary instance. (Is it possible that the ref count for cfd is 0 but // secondary instance. (Is it possible that the ref count for cfd is 0 but
// the ref count for its versions is higher than 0?) // the ref count for its versions is higher than 0?)
cfd->SetDropped(); cfd->SetDropped();
if (cfd->Unref()) { if (cfd->UnrefAndTryDelete()) {
delete cfd;
cfd = nullptr; cfd = nullptr;
} }
active_version_builders_.erase(builder_iter); active_version_builders_.erase(builder_iter);

Loading…
Cancel
Save