Fix bug affecting GetSortedWalFiles, Backups, Checkpoint (#9208)

Summary:
Saw error like this:
`Backup failed -- IO error: No such file or directory: While opening a
file for sequentially reading:
/dev/shm/rocksdb/rocksdb_crashtest_blackbox/004426.log: No such file or
directory`

Unfortunately, GetSortedWalFiles (used by Backups, Checkpoint, etc.)
relies on no file deletions happening while its operating, which
means not only disabling (more) deletions, but ensuring any pending
deletions are completed. Two fixes related to this:

* There was a gap in several places between decrementing
pending_purge_obsolete_files_ and incrementing bg_purge_scheduled_ where
the db mutex would be released and GetSortedWalFiles (and others) could
get false information that no deletions are pending.

* The fix to https://github.com/facebook/rocksdb/issues/8591 (disabling deletions in GetSortedWalFiles) seems
incomplete because it doesn't prevent pending deletions from occuring
during the operation (if deletions not already disabled, the case that
was to be fixed by the change).

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9208

Test Plan:
existing tests (it's hard to write a test for interleavings
that are now excluded - this is what stress test is for)

Reviewed By: ajkr

Differential Revision: D32630675

Pulled By: pdillinger

fbshipit-source-id: a121e3da648de130cd24d44c524232f4eb22f178
main
Peter Dillinger 3 years ago committed by Facebook GitHub Bot
parent 5384f0af6e
commit 2a67d475f1
  1. 5
      db/column_family.cc
  2. 37
      db/db_filesnapshot.cc
  3. 14
      db/db_impl/db_impl.cc
  4. 12
      db/db_impl/db_impl_files.cc

@ -75,11 +75,6 @@ ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
bool defer_purge =
db_->immutable_db_options().avoid_unnecessary_blocking_io;
db_->PurgeObsoleteFiles(job_context, defer_purge);
if (defer_purge) {
mutex_->Lock();
db_->SchedulePurge();
mutex_->Unlock();
}
}
job_context.Clean();
}

@ -127,31 +127,30 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
}
Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
// If caller disabled deletions, this function should return files that are
// guaranteed not to be deleted until deletions are re-enabled. We need to
// wait for pending purges to finish since WalManager doesn't know which
// files are going to be purged. Additional purges won't be scheduled as
// long as deletions are disabled (so the below loop must terminate).
// Also note that we disable deletions anyway to avoid the case where a
// file is deleted in the middle of the scan, causing IO error.
Status deletions_disabled = DisableFileDeletions();
{
// If caller disabled deletions, this function should return files that are
// guaranteed not to be deleted until deletions are re-enabled. We need to
// wait for pending purges to finish since WalManager doesn't know which
// files are going to be purged. Additional purges won't be scheduled as
// long as deletions are disabled (so the below loop must terminate).
InstrumentedMutexLock l(&mutex_);
while (disable_delete_obsolete_files_ > 0 &&
(pending_purge_obsolete_files_ > 0 || bg_purge_scheduled_ > 0)) {
while (pending_purge_obsolete_files_ > 0 || bg_purge_scheduled_ > 0) {
bg_cv_.Wait();
}
}
// Disable deletion in order to avoid the case where a file is deleted in
// the middle of the process so IO error is returned.
Status s = DisableFileDeletions();
bool file_deletion_supported = !s.IsNotSupported();
if (s.ok() || !file_deletion_supported) {
s = wal_manager_.GetSortedWalFiles(files);
if (file_deletion_supported) {
Status s2 = EnableFileDeletions(false);
if (!s2.ok() && s.ok()) {
s = s2;
}
}
Status s = wal_manager_.GetSortedWalFiles(files);
// DisableFileDeletions / EnableFileDeletions not supported in read-only DB
if (deletions_disabled.ok()) {
Status s2 = EnableFileDeletions(/*force*/ false);
assert(s2.ok());
s2.PermitUncheckedError();
} else {
assert(deletions_disabled.IsNotSupported());
}
return s;

@ -1546,6 +1546,8 @@ void DBImpl::BackgroundCallPurge() {
mutex_.Lock();
}
assert(bg_purge_scheduled_ > 0);
// Can't use iterator to go over purge_files_ because inside the loop we're
// unlocking the mutex that protects purge_files_.
while (!purge_files_.empty()) {
@ -1613,17 +1615,7 @@ static void CleanupIteratorState(void* arg1, void* /*arg2*/) {
delete state->super_version;
}
if (job_context.HaveSomethingToDelete()) {
if (state->background_purge) {
// PurgeObsoleteFiles here does not delete files. Instead, it adds the
// files to be deleted to a job queue, and deletes it in a separate
// background thread.
state->db->PurgeObsoleteFiles(job_context, true /* schedule only */);
state->mu->Lock();
state->db->SchedulePurge();
state->mu->Unlock();
} else {
state->db->PurgeObsoleteFiles(job_context);
}
state->db->PurgeObsoleteFiles(job_context, state->background_purge);
}
job_context.Clean();
}

@ -56,6 +56,8 @@ Status DBImpl::DisableFileDeletions() {
return s;
}
// FIXME: can be inconsistent with DisableFileDeletions in cases like
// DBImplReadOnly
Status DBImpl::DisableFileDeletionsWithLock() {
mutex_.AssertHeld();
++disable_delete_obsolete_files_;
@ -642,6 +644,11 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
InstrumentedMutexLock l(&mutex_);
--pending_purge_obsolete_files_;
assert(pending_purge_obsolete_files_ >= 0);
if (schedule_only) {
// Must change from pending_purge_obsolete_files_ to bg_purge_scheduled_
// while holding mutex (for GetSortedWalFiles() etc.)
SchedulePurge();
}
if (pending_purge_obsolete_files_ == 0) {
bg_cv_.SignalAll();
}
@ -657,11 +664,6 @@ void DBImpl::DeleteObsoleteFiles() {
if (job_context.HaveSomethingToDelete()) {
bool defer_purge = immutable_db_options_.avoid_unnecessary_blocking_io;
PurgeObsoleteFiles(job_context, defer_purge);
if (defer_purge) {
mutex_.Lock();
SchedulePurge();
mutex_.Unlock();
}
}
job_context.Clean();
mutex_.Lock();

Loading…
Cancel
Save