From fbce7a3808094ce53431a6c718ad97e7b2a9b420 Mon Sep 17 00:00:00 2001 From: Cheng Chang Date: Fri, 18 Dec 2020 21:33:20 -0800 Subject: [PATCH] Track WAL obsoletion when updating empty CF's log number (#7781) Summary: In the write path, there is an optimization: when a new WAL is created during SwitchMemtable, we update the internal log number of the empty column families to the new WAL. `FindObsoleteFiles` marks a WAL as obsolete if the WAL's log number is less than `VersionSet::MinLogNumberWithUnflushedData`. After updating the empty column families' internal log number, `VersionSet::MinLogNumberWithUnflushedData` might change, so some WALs might become obsolete to be purged from disk. For example, consider there are 3 column families: 0, 1, 2: 1. initially, all the column families' log number is 1; 2. write some data to cf0, and flush cf0, but the flush is pending; 3. now a new WAL 2 is created; 4. write data to cf1 and WAL 2, now cf0's log number is 1, cf1's log number is 2, cf2's log number is 2 (because cf1 and cf2 are empty, so their log numbers will be set to the highest log number); 5. now cf0's flush hasn't finished, flush cf1, a new WAL 3 is created, and cf1's flush finishes, now cf0's log number is 1, cf1's log number is 3, cf2's log number is 3, since WAL 1 still contains data for the unflushed cf0, no WAL can be deleted from disk; 6. now cf0's flush finishes, cf0's log number is 2 (because when cf0 was switching memtable, WAL 3 does not exist yet), cf1's log number is 3, cf2's log number is 3, so WAL 1 can be purged from disk now, but WAL 2 still cannot because `MinLogNumberToKeep()` is 2; 7. write data to cf2 and WAL 3, because cf0 is empty, its log number is updated to 3, so now cf0's log number is 3, cf1's log number is 3, cf2's log number is 3; 8. now if the background threads want to purge obsolete files from disk, WAL 2 can be purged because `MinLogNumberToKeep()` is 3. But there are only two flush results written to MANIFEST: the first is for flushing cf1, and the `MinLogNumberToKeep` is 1, the second is for flushing cf0, and the `MinLogNumberToKeep` is 2. So without this PR, if the DB crashes at this point and try to recover, `WalSet` will still expect WAL 2 to exist. When WAL tracking is enabled, we assume WALs will only become obsolete after a flush result is written to MANIFEST in `MemtableList::TryInstallMemtableFlushResults` (or its atomic flush counterpart). The above situation breaks this assumption. This PR tracks WAL obsoletion if necessary before updating the empty column families' log numbers. Pull Request resolved: https://github.com/facebook/rocksdb/pull/7781 Test Plan: watch existing tests and stress tests to pass. `make -j48 blackbox_crash_test` on devserver Reviewed By: ltamasi Differential Revision: D25631695 Pulled By: cheng-chang fbshipit-source-id: ca7fff967bdb42204b84226063d909893bc0a4ec --- db/column_family.h | 5 +++ db/db_impl/db_impl_write.cc | 67 +++++++++++++++++++++++++++++++------ db/version_set.h | 20 ++++++++++- 3 files changed, 81 insertions(+), 11 deletions(-) diff --git a/db/column_family.h b/db/column_family.h index 857ed818d..94e26ae49 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -350,6 +350,11 @@ class ColumnFamilyData { MemTableList* imm() { return &imm_; } MemTable* mem() { return mem_; } + + bool IsEmpty() { + return mem()->GetFirstSequenceNumber() == 0 && imm()->NumNotFlushed() == 0; + } + Version* current() { return current_; } Version* dummy_versions() { return dummy_versions_; } void SetCurrent(Version* _current); diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 7a92fe611..982359aa5 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -1819,17 +1819,64 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { return s; } - for (auto loop_cfd : *versions_->GetColumnFamilySet()) { - // all this is just optimization to delete logs that - // are no longer needed -- if CF is empty, that means it - // doesn't need that particular log to stay alive, so we just - // advance the log number. no need to persist this in the manifest - if (loop_cfd->mem()->GetFirstSequenceNumber() == 0 && - loop_cfd->imm()->NumNotFlushed() == 0) { - if (creating_new_log) { - loop_cfd->SetLogNumber(logfile_number_); + bool empty_cf_updated = false; + if (immutable_db_options_.track_and_verify_wals_in_manifest && + !immutable_db_options_.allow_2pc && creating_new_log) { + // In non-2pc mode, WALs become obsolete if they do not contain unflushed + // data. Updating the empty CF's log number might cause some WALs to become + // obsolete. So we should track the WAL obsoletion event before actually + // updating the empty CF's log number. + uint64_t min_wal_number_to_keep = + versions_->PreComputeMinLogNumberWithUnflushedData(logfile_number_); + if (min_wal_number_to_keep > + versions_->GetWalSet().GetMinWalNumberToKeep()) { + // Get a snapshot of the empty column families. + // LogAndApply may release and reacquire db + // mutex, during that period, column family may become empty (e.g. its + // flush succeeds), then it affects the computed min_log_number_to_keep, + // so we take a snapshot for consistency of column family data + // status. If a column family becomes non-empty afterwards, its active log + // should still be the created new log, so the min_log_number_to_keep is + // not affected. + autovector empty_cfs; + for (auto cf : *versions_->GetColumnFamilySet()) { + if (cf->IsEmpty()) { + empty_cfs.push_back(cf); + } + } + + VersionEdit wal_deletion; + wal_deletion.DeleteWalsBefore(min_wal_number_to_keep); + s = versions_->LogAndApplyToDefaultColumnFamily(&wal_deletion, &mutex_); + if (!s.ok() && versions_->io_status().IsIOError()) { + s = error_handler_.SetBGError(versions_->io_status(), + BackgroundErrorReason::kManifestWrite); + } + if (!s.ok()) { + return s; + } + + for (auto cf : empty_cfs) { + if (cf->IsEmpty()) { + cf->SetLogNumber(logfile_number_); + cf->mem()->SetCreationSeq(versions_->LastSequence()); + } // cf may become non-empty. + } + empty_cf_updated = true; + } + } + if (!empty_cf_updated) { + for (auto cf : *versions_->GetColumnFamilySet()) { + // all this is just optimization to delete logs that + // are no longer needed -- if CF is empty, that means it + // doesn't need that particular log to stay alive, so we just + // advance the log number. no need to persist this in the manifest + if (cf->IsEmpty()) { + if (creating_new_log) { + cf->SetLogNumber(logfile_number_); + } + cf->mem()->SetCreationSeq(versions_->LastSequence()); } - loop_cfd->mem()->SetCreationSeq(versions_->LastSequence()); } } diff --git a/db/version_set.h b/db/version_set.h index 78b26ba4d..7cada5f46 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -1127,10 +1127,28 @@ class VersionSet { return PreComputeMinLogNumberWithUnflushedData(nullptr); } // Returns the minimum log number which still has data not flushed to any SST + // file. + // Empty column families' log number is considered to be + // new_log_number_for_empty_cf. + uint64_t PreComputeMinLogNumberWithUnflushedData( + uint64_t new_log_number_for_empty_cf) const { + uint64_t min_log_num = port::kMaxUint64; + for (auto cfd : *column_family_set_) { + // It's safe to ignore dropped column families here: + // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST. + uint64_t num = + cfd->IsEmpty() ? new_log_number_for_empty_cf : cfd->GetLogNumber(); + if (min_log_num > num && !cfd->IsDropped()) { + min_log_num = num; + } + } + return min_log_num; + } + // Returns the minimum log number which still has data not flushed to any SST // file, except data from `cfd_to_skip`. uint64_t PreComputeMinLogNumberWithUnflushedData( const ColumnFamilyData* cfd_to_skip) const { - uint64_t min_log_num = std::numeric_limits::max(); + uint64_t min_log_num = port::kMaxUint64; for (auto cfd : *column_family_set_) { if (cfd == cfd_to_skip) { continue;