diff --git a/HISTORY.md b/HISTORY.md index 722967431..2a432ff73 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -8,6 +8,7 @@ * In best-efforts recovery, an error that is not Corruption or IOError::kNotFound or IOError::kPathNotFound will be overwritten silently. Fix this by checking all non-ok cases and return early. * Compressed block cache was automatically disabled with read-only DBs by mistake. Now it is fixed: compressed block cache will be in effective with read-only DB too. * Fix a bug of wrong iterator result if another thread finishes an update and a DB flush between two statement. +* Disable file deletion after MANIFEST write/sync failure until db re-open or Resume() so that subsequent re-open will not see MANIFEST referencing deleted SSTs. ### Public API Change * `DB::GetDbSessionId(std::string& session_id)` is added. `session_id` stores a unique identifier that gets reset every time the DB is opened. This DB session ID should be unique among all open DB instances on all hosts, and should be unique among re-openings of the same or other DBs. This identifier is recorded in the LOG file on the line starting with "DB Session ID:". diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 77fb8db65..d574e9192 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -724,7 +724,7 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { cfd->internal_stats()->AddCompactionStats( compact_->compaction->output_level(), thread_pri_, compaction_stats_); - versions_->SetIOStatusOK(); + versions_->SetIOStatus(IOStatus::OK()); if (status.ok()) { status = InstallCompactionResults(mutable_cf_options); } diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index cdef126aa..badcacc2d 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -3073,6 +3073,33 @@ TEST_F(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) { Close(); } +TEST_F(DBBasicTest, ManifestWriteFailure) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.env = env_; + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:AfterSyncManifest", [&](void* arg) { + ASSERT_NE(nullptr, arg); + auto* s = reinterpret_cast(arg); + ASSERT_OK(*s); + // Manually overwrite return status + *s = Status::IOError(); + }); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put("key", "value")); + ASSERT_NOK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); +} + } // namespace ROCKSDB_NAMESPACE #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 8af226c31..e9a094a28 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -23,57 +23,6 @@ namespace ROCKSDB_NAMESPACE { -Status DBImpl::DisableFileDeletions() { - InstrumentedMutexLock l(&mutex_); - ++disable_delete_obsolete_files_; - if (disable_delete_obsolete_files_ == 1) { - ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Disabled"); - } else { - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "File Deletions Disabled, but already disabled. Counter: %d", - disable_delete_obsolete_files_); - } - return Status::OK(); -} - -Status DBImpl::EnableFileDeletions(bool force) { - // Job id == 0 means that this is not our background process, but rather - // user thread - JobContext job_context(0); - bool file_deletion_enabled = false; - { - InstrumentedMutexLock l(&mutex_); - if (force) { - // if force, we need to enable file deletions right away - disable_delete_obsolete_files_ = 0; - } else if (disable_delete_obsolete_files_ > 0) { - --disable_delete_obsolete_files_; - } - if (disable_delete_obsolete_files_ == 0) { - file_deletion_enabled = true; - FindObsoleteFiles(&job_context, true); - bg_cv_.SignalAll(); - } - } - if (file_deletion_enabled) { - ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enabled"); - if (job_context.HaveSomethingToDelete()) { - PurgeObsoleteFiles(job_context); - } - } else { - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "File Deletions Enable, but not really enabled. Counter: %d", - disable_delete_obsolete_files_); - } - job_context.Clean(); - LogFlush(immutable_db_options_.info_log); - return Status::OK(); -} - -int DBImpl::IsFileDeletionsEnabled() const { - return !disable_delete_obsolete_files_; -} - Status DBImpl::GetLiveFiles(std::vector& ret, uint64_t* manifest_file_size, bool flush_memtable) { diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 91318f3b5..3a5ccc831 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -313,8 +313,36 @@ Status DBImpl::ResumeImpl() { } // Make sure the IO Status stored in version set is set to OK. + bool file_deletion_disabled = !IsFileDeletionsEnabled(); if (s.ok()) { - versions_->SetIOStatusOK(); + IOStatus io_s = versions_->io_status(); + if (io_s.IsIOError()) { + // If resuming from IOError resulted from MANIFEST write, then assert + // that we must have already set the MANIFEST writer to nullptr during + // clean-up phase MANIFEST writing. We must have also disabled file + // deletions. + assert(!versions_->descriptor_log_); + assert(file_deletion_disabled); + // Since we are trying to recover from MANIFEST write error, we need to + // switch to a new MANIFEST anyway. The old MANIFEST can be corrupted. + // Therefore, force writing a dummy version edit because we do not know + // whether there are flush jobs with non-empty data to flush, triggering + // appends to MANIFEST. + VersionEdit edit; + auto cfh = reinterpret_cast(default_cf_handle_); + assert(cfh); + ColumnFamilyData* cfd = cfh->cfd(); + const MutableCFOptions& cf_opts = *cfd->GetLatestMutableCFOptions(); + s = versions_->LogAndApply(cfd, cf_opts, &edit, &mutex_, + directories_.GetDbDir()); + if (!s.ok()) { + io_s = versions_->io_status(); + if (!io_s.ok()) { + s = error_handler_.SetBGError(io_s, + BackgroundErrorReason::kManifestWrite); + } + } + } } // We cannot guarantee consistency of the WAL. So force flush Memtables of @@ -365,6 +393,13 @@ Status DBImpl::ResumeImpl() { job_context.Clean(); if (s.ok()) { + assert(versions_->io_status().ok()); + // If we reach here, we should re-enable file deletions if it was disabled + // during previous error handling. + if (file_deletion_disabled) { + // Always return ok + EnableFileDeletions(/*force=*/true); + } ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB"); } mutex_.Lock(); @@ -4405,6 +4440,14 @@ Status DBImpl::IngestExternalFiles( #endif // !NDEBUG } } + } else if (versions_->io_status().IsIOError()) { + // Error while writing to MANIFEST. + // In fact, versions_->io_status() can also be the result of renaming + // CURRENT file. With current code, it's just difficult to tell. So just + // be pessimistic and try write to a new MANIFEST. + // TODO: distinguish between MANIFEST write and CURRENT renaming + const IOStatus& io_s = versions_->io_status(); + error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite); } // Resume writes to the DB diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 91a6393bb..4198cf048 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -358,6 +358,12 @@ class DBImpl : public DB { virtual Status Close() override; + virtual Status DisableFileDeletions() override; + + virtual Status EnableFileDeletions(bool force) override; + + virtual bool IsFileDeletionsEnabled() const; + Status GetStatsHistory( uint64_t start_time, uint64_t end_time, std::unique_ptr* stats_iterator) override; @@ -365,9 +371,6 @@ class DBImpl : public DB { #ifndef ROCKSDB_LITE using DB::ResetStats; virtual Status ResetStats() override; - virtual Status DisableFileDeletions() override; - virtual Status EnableFileDeletions(bool force) override; - virtual int IsFileDeletionsEnabled() const; // All the returned filenames start with "/" virtual Status GetLiveFiles(std::vector&, uint64_t* manifest_file_size, @@ -1789,6 +1792,8 @@ class DBImpl : public DB { SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback, bool* is_blob_index); + Status DisableFileDeletionsWithLock(); + // table_cache_ provides its own synchronization std::shared_ptr table_cache_; diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index a58ba97d5..498e1ac0b 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -211,7 +211,15 @@ Status DBImpl::FlushMemTableToOutputFile( if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) { if (!io_s.ok() && !io_s.IsShutdownInProgress() && !io_s.IsColumnFamilyDropped()) { - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush); + // Error while writing to MANIFEST. + // In fact, versions_->io_status() can also be the result of renaming + // CURRENT file. With current code, it's just difficult to tell. So just + // be pessimistic and try write to a new MANIFEST. + // TODO: distinguish between MANIFEST write and CURRENT renaming + auto err_reason = versions_->io_status().ok() + ? BackgroundErrorReason::kFlush + : BackgroundErrorReason::kManifestWrite; + error_handler_.SetBGError(io_s, err_reason); } else { Status new_bg_error = s; error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); @@ -575,7 +583,15 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( // it is not because of CF drop. if (!s.ok() && !s.IsColumnFamilyDropped()) { if (!io_s.ok() && !io_s.IsColumnFamilyDropped()) { - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush); + // Error while writing to MANIFEST. + // In fact, versions_->io_status() can also be the result of renaming + // CURRENT file. With current code, it's just difficult to tell. So just + // be pessimistic and try write to a new MANIFEST. + // TODO: distinguish between MANIFEST write and CURRENT renaming + auto err_reason = versions_->io_status().ok() + ? BackgroundErrorReason::kFlush + : BackgroundErrorReason::kManifestWrite; + error_handler_.SetBGError(io_s, err_reason); } else { Status new_bg_error = s; error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); @@ -2689,7 +2705,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, for (const auto& f : *c->inputs(0)) { c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); } - versions_->SetIOStatusOK(); + versions_->SetIOStatus(IOStatus::OK()); status = versions_->LogAndApply(c->column_family_data(), *c->mutable_cf_options(), c->edit(), &mutex_, directories_.GetDbDir()); @@ -2747,7 +2763,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, } } - versions_->SetIOStatusOK(); + versions_->SetIOStatus(IOStatus::OK()); status = versions_->LogAndApply(c->column_family_data(), *c->mutable_cf_options(), c->edit(), &mutex_, directories_.GetDbDir()); @@ -2880,7 +2896,15 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s", status.ToString().c_str()); if (!io_s.ok()) { - error_handler_.SetBGError(io_s, BackgroundErrorReason::kCompaction); + // Error while writing to MANIFEST. + // In fact, versions_->io_status() can also be the result of renaming + // CURRENT file. With current code, it's just difficult to tell. So just + // be pessimistic and try write to a new MANIFEST. + // TODO: distinguish between MANIFEST write and CURRENT renaming + auto err_reason = versions_->io_status().ok() + ? BackgroundErrorReason::kCompaction + : BackgroundErrorReason::kManifestWrite; + error_handler_.SetBGError(io_s, err_reason); } else { error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); } diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index c43772dfd..ea0d12296 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -36,6 +36,62 @@ uint64_t DBImpl::MinObsoleteSstNumberToKeep() { return std::numeric_limits::max(); } +Status DBImpl::DisableFileDeletions() { + InstrumentedMutexLock l(&mutex_); + return DisableFileDeletionsWithLock(); +} + +Status DBImpl::DisableFileDeletionsWithLock() { + mutex_.AssertHeld(); + ++disable_delete_obsolete_files_; + if (disable_delete_obsolete_files_ == 1) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Disabled"); + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "File Deletions Disabled, but already disabled. Counter: %d", + disable_delete_obsolete_files_); + } + return Status::OK(); +} + +Status DBImpl::EnableFileDeletions(bool force) { + // Job id == 0 means that this is not our background process, but rather + // user thread + JobContext job_context(0); + bool file_deletion_enabled = false; + { + InstrumentedMutexLock l(&mutex_); + if (force) { + // if force, we need to enable file deletions right away + disable_delete_obsolete_files_ = 0; + } else if (disable_delete_obsolete_files_ > 0) { + --disable_delete_obsolete_files_; + } + if (disable_delete_obsolete_files_ == 0) { + file_deletion_enabled = true; + FindObsoleteFiles(&job_context, true); + bg_cv_.SignalAll(); + } + } + if (file_deletion_enabled) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enabled"); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + } + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "File Deletions Enable, but not really enabled. Counter: %d", + disable_delete_obsolete_files_); + } + job_context.Clean(); + LogFlush(immutable_db_options_.info_log); + return Status::OK(); +} + +bool DBImpl::IsFileDeletionsEnabled() const { + return 0 == disable_delete_obsolete_files_; +} + // * Returns the list of live files in 'sst_live' and 'blob_live'. // If it's doing full scan: // * Returns the list of all files in the filesystem in diff --git a/db/db_test.cc b/db/db_test.cc index 55df60afb..0c932c3c1 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2990,10 +2990,11 @@ class ModelDB : public DB { Status SyncWAL() override { return Status::OK(); } -#ifndef ROCKSDB_LITE Status DisableFileDeletions() override { return Status::OK(); } Status EnableFileDeletions(bool /*force*/) override { return Status::OK(); } +#ifndef ROCKSDB_LITE + Status GetLiveFiles(std::vector&, uint64_t* /*size*/, bool /*flush_memtable*/ = true) override { return Status::OK(); diff --git a/db/error_handler.cc b/db/error_handler.cc index 3c99dce99..1f7bbd7ec 100644 --- a/db/error_handler.cc +++ b/db/error_handler.cc @@ -51,9 +51,19 @@ std::map, Status::Code::kIOError, Status::SubCode::kNoSpace, false), Status::Severity::kHardError}, + // Errors during MANIFEST write + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + Status::Code::kIOError, Status::SubCode::kNoSpace, + true), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + Status::Code::kIOError, Status::SubCode::kNoSpace, + false), + Status::Severity::kHardError}, }; -std::map, Status::Severity> +std::map, + Status::Severity> DefaultErrorSeverityMap = { // Errors during BG compaction {std::make_tuple(BackgroundErrorReason::kCompaction, @@ -75,11 +85,11 @@ std::map, Status::Severity {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kCorruption, false), Status::Severity::kNoError}, - {std::make_tuple(BackgroundErrorReason::kFlush, - Status::Code::kIOError, true), + {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, + true), Status::Severity::kFatalError}, - {std::make_tuple(BackgroundErrorReason::kFlush, - Status::Code::kIOError, false), + {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, + false), Status::Severity::kNoError}, // Errors during Write {std::make_tuple(BackgroundErrorReason::kWriteCallback, @@ -94,30 +104,36 @@ std::map, Status::Severity {std::make_tuple(BackgroundErrorReason::kWriteCallback, Status::Code::kIOError, false), Status::Severity::kNoError}, + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + Status::Code::kIOError, true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + Status::Code::kIOError, false), + Status::Severity::kFatalError}, }; std::map, Status::Severity> DefaultReasonMap = { // Errors during BG compaction {std::make_tuple(BackgroundErrorReason::kCompaction, true), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, {std::make_tuple(BackgroundErrorReason::kCompaction, false), - Status::Severity::kNoError}, + Status::Severity::kNoError}, // Errors during BG flush {std::make_tuple(BackgroundErrorReason::kFlush, true), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, {std::make_tuple(BackgroundErrorReason::kFlush, false), - Status::Severity::kNoError}, + Status::Severity::kNoError}, // Errors during Write {std::make_tuple(BackgroundErrorReason::kWriteCallback, true), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, {std::make_tuple(BackgroundErrorReason::kWriteCallback, false), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, // Errors during Memtable update {std::make_tuple(BackgroundErrorReason::kMemTable, true), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, {std::make_tuple(BackgroundErrorReason::kMemTable, false), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, }; void ErrorHandler::CancelErrorRecovery() { @@ -247,6 +263,10 @@ Status ErrorHandler::SetBGError(const IOStatus& bg_io_err, if (recovery_in_prog_ && recovery_error_.ok()) { recovery_error_ = bg_io_err; } + if (BackgroundErrorReason::kManifestWrite == reason) { + // Always returns ok + db_->DisableFileDeletionsWithLock(); + } Status new_bg_io_err = bg_io_err; Status s; if (bg_io_err.GetDataLoss()) { diff --git a/db/internal_stats.cc b/db/internal_stats.cc index f729ee7c7..ff4c5f46c 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -798,7 +798,7 @@ bool InternalStats::HandleCurrentSuperVersionNumber(uint64_t* value, bool InternalStats::HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db, Version* /*version*/) { - *value = db->IsFileDeletionsEnabled(); + *value = db->IsFileDeletionsEnabled() ? 1 : 0; return true; } diff --git a/db/memtable_list.cc b/db/memtable_list.cc index acdba0896..89de07f3d 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -470,7 +470,7 @@ Status MemTableList::TryInstallMemtableFlushResults( } // this can release and reacquire the mutex. - vset->SetIOStatusOK(); + vset->SetIOStatus(IOStatus::OK()); s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu, db_directory); *io_s = vset->io_status(); diff --git a/db/version_set.cc b/db/version_set.cc index 899e7125b..8696b57f3 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -4001,12 +4001,16 @@ Status VersionSet::ProcessManifestWrites( } if (s.ok()) { io_s = SyncManifest(env_, db_options_, descriptor_log_->file()); + TEST_SYNC_POINT_CALLBACK( + "VersionSet::ProcessManifestWrites:AfterSyncManifest", &io_s); } if (!io_s.ok()) { io_status_ = io_s; s = io_s; ROCKS_LOG_ERROR(db_options_->info_log, "MANIFEST write %s\n", s.ToString().c_str()); + } else if (io_status_.IsIOError()) { + io_status_ = io_s; } } @@ -4018,6 +4022,8 @@ Status VersionSet::ProcessManifestWrites( if (!io_s.ok()) { io_status_ = io_s; s = io_s; + } else if (io_status_.IsIOError()) { + io_status_ = io_s; } TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:AfterNewManifest"); } diff --git a/db/version_set.h b/db/version_set.h index d1766d0bf..16661e097 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -1162,7 +1162,7 @@ class VersionSet { IOStatus io_status() const { return io_status_; } // Set the IO Status to OK. Called before Manifest write if needed. - void SetIOStatusOK() { io_status_ = IOStatus::OK(); } + void SetIOStatus(const IOStatus& s) { io_status_ = s; } protected: using VersionBuilderMap = diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 1fa3158a8..b6c5dd009 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -1266,8 +1266,6 @@ class DB { // updated, false if user attempted to call if with seqnum <= current value. virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) = 0; -#ifndef ROCKSDB_LITE - // Prevent file deletions. Compactions will continue to occur, // but no obsolete files will be deleted. Calling this multiple // times have the same effect as calling it once. @@ -1284,6 +1282,7 @@ class DB { // threads call EnableFileDeletions() virtual Status EnableFileDeletions(bool force = true) = 0; +#ifndef ROCKSDB_LITE // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup // Retrieve the list of all files in the database. The files are diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index d1c953f0f..97570713f 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -117,6 +117,7 @@ enum class BackgroundErrorReason { kCompaction, kWriteCallback, kMemTable, + kManifestWrite, }; enum class WriteStallCondition {