From 1e9bf25f61fa5b08fe2b3fe5c1fcf33e4c1c515c Mon Sep 17 00:00:00 2001 From: Wallace Date: Thu, 21 Jul 2022 13:35:36 -0700 Subject: [PATCH] Do not hold mutex when write keys if not necessary (#7516) Summary: ## Problem Summary RocksDB acquires the global mutex of the db instance every time the user calls `Write`. When RocksDB schedules a lot of compaction jobs, they compete for the mutex with the write thread, which hurts write performance. ## Problem Solution: I want to use log_write_mutex in place of the global mutex in most cases, so that we do not acquire it in the write thread unless a write-stall event or a write-buffer-full event occurs. Pull Request resolved: https://github.com/facebook/rocksdb/pull/7516 Test Plan: 1. make check 2. CI 3. COMPILE_WITH_TSAN=1 make db_stress make crash_test make crash_test_with_multiops_wp_txn make crash_test_with_multiops_wc_txn make crash_test_with_atomic_flush Reviewed By: siying Differential Revision: D36908702 Pulled By: riversand963 fbshipit-source-id: 59b13881f4f5c0a58fd3ca79128a396d9cd98efe --- HISTORY.md | 1 + db/db_compaction_test.cc | 14 +-- db/db_impl/db_impl.cc | 93 +++++++++++-------- db/db_impl/db_impl.h | 121 +++++++++++++++++++++---- db/db_impl/db_impl_compaction_flush.cc | 32 +++++-- db/db_impl/db_impl_debug.cc | 2 +- db/db_impl/db_impl_files.cc | 53 ++++++----- db/db_impl/db_impl_open.cc | 14 --- db/db_impl/db_impl_write.cc | 117 +++++++++++++----------- db/error_handler.cc | 7 ++ db/error_handler.h | 19 ++-- db/perf_context_test.cc | 2 +- db/version_set_test.cc | 14 +-- db/wal_edit.cc | 37 +++++--- db/wal_edit.h | 10 ++ db/wal_edit_test.cc | 9 +- utilities/backup/backup_engine_test.cc | 3 +- 17 files changed, 340 insertions(+), 208 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 5b8c42207..21ebb1046 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -99,6 +99,7 @@ ### Behavior changes * DB::Open(), DB::OpenAsSecondary() will fail if a Logger cannot be created (#9984) +* DB::Write does not hold global `mutex_` if this db instance does not need to switch wal and mem-table (#7516). * Removed support for reading Bloom filters using obsolete block-based filter format. (Support for writing such filters was dropped in 7.0.) For good read performance on old DBs using these filters, a full compaction is required. * Per KV checksum in write batch is verified before a write batch is written to WAL to detect any corruption to the write batch (#10114). diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 93362b323..0820fe288 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -5659,18 +5659,10 @@ TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) { for (int j = 0; j != kNumKeysPerFile; ++j) { ASSERT_OK(Put(Key(j), rnd.RandomString(990))); } - if (0 == i) { - // When we reach here, the memtables have kNumKeysPerFile keys. Note that - // flush is not yet triggered. We need to write an extra key so that the - // write path will call PreprocessWrite and flush the previous key-value - // pairs to e flushed. After that, there will be the newest key in the - // memtable, and a bunch of L0 files. Since there is already one key in - // the memtable, then for i = 1, 2, ..., we do not have to write this - // extra key to trigger flush.
- ASSERT_OK(Put("", "")); + if (i > 0) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i); } - ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); - ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i + 1); } // When we reach this point, there will be level0_stop_writes_trigger L0 // files and one extra key (99) in memory, which overlaps with the external diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 7ac73671f..7b0091074 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -185,7 +185,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, log_dir_synced_(false), log_empty_(true), persist_stats_cf_handle_(nullptr), - log_sync_cv_(&mutex_), + log_sync_cv_(&log_write_mutex_), total_log_size_(0), is_snapshot_supported_(true), write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()), @@ -273,6 +273,8 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, mutable_db_options_.Dump(immutable_db_options_.info_log.get()); DumpSupportInfo(immutable_db_options_.info_log.get()); + max_total_wal_size_.store(mutable_db_options_.max_total_wal_size, + std::memory_order_relaxed); if (write_buffer_manager_) { wbm_stall_.reset(new WBMStallInterface()); } @@ -625,26 +627,28 @@ Status DBImpl::CloseHelper() { job_context.Clean(); mutex_.Lock(); } - - for (auto l : logs_to_free_) { - delete l; - } - for (auto& log : logs_) { - uint64_t log_number = log.writer->get_log_number(); - Status s = log.ClearWriter(); - if (!s.ok()) { - ROCKS_LOG_WARN( - immutable_db_options_.info_log, - "Unable to Sync WAL file %s with error -- %s", - LogFileName(immutable_db_options_.GetWalDir(), log_number).c_str(), - s.ToString().c_str()); - // Retain the first error - if (ret.ok()) { - ret = s; + { + InstrumentedMutexLock lock(&log_write_mutex_); + for (auto l : logs_to_free_) { + delete l; + } + for (auto& log : logs_) { + uint64_t log_number = log.writer->get_log_number(); + Status s = log.ClearWriter(); + if (!s.ok()) { + ROCKS_LOG_WARN( + immutable_db_options_.info_log, + "Unable to Sync WAL file %s with error -- %s", + LogFileName(immutable_db_options_.GetWalDir(), log_number).c_str(), + s.ToString().c_str()); + // Retain the first error + if (ret.ok()) { + ret = s; + } } } + logs_.clear(); } - logs_.clear(); // Table cache may have table handles holding blocks from the block cache. // We need to release them before the block cache is destroyed. The block @@ -1108,6 +1112,7 @@ Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family, } void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) { + mutex_.AssertHeld(); if (!job_context->logs_to_free.empty()) { for (auto l : job_context->logs_to_free) { AddToLogsToFreeQueue(l); @@ -1285,6 +1290,11 @@ Status DBImpl::SetDBOptions( new_options.stats_persist_period_sec); mutex_.Lock(); } + if (new_options.max_total_wal_size != + mutable_db_options_.max_total_wal_size) { + max_total_wal_size_.store(new_options.max_total_wal_size, + std::memory_order_release); + } write_controller_.set_max_delayed_write_rate( new_options.delayed_write_rate); table_cache_.get()->SetCapacity(new_options.max_open_files == -1 @@ -1405,7 +1415,7 @@ Status DBImpl::SyncWAL() { uint64_t current_log_number; { - InstrumentedMutexLock l(&mutex_); + InstrumentedMutexLock l(&log_write_mutex_); assert(!logs_.empty()); // This SyncWAL() call only cares about logs up to this number. 
@@ -1462,19 +1472,37 @@ Status DBImpl::SyncWAL() { TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2"); TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1"); + VersionEdit synced_wals; { - InstrumentedMutexLock l(&mutex_); + InstrumentedMutexLock l(&log_write_mutex_); if (status.ok()) { - status = MarkLogsSynced(current_log_number, need_log_dir_sync); + MarkLogsSynced(current_log_number, need_log_dir_sync, &synced_wals); } else { MarkLogsNotSynced(current_log_number); } } + if (status.ok() && synced_wals.IsWalAddition()) { + InstrumentedMutexLock l(&mutex_); + status = ApplyWALToManifest(&synced_wals); + } + TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2"); return status; } +Status DBImpl::ApplyWALToManifest(VersionEdit* synced_wals) { + // not empty, write to MANIFEST. + mutex_.AssertHeld(); + Status status = + versions_->LogAndApplyToDefaultColumnFamily(synced_wals, &mutex_); + if (!status.ok() && versions_->io_status().IsIOError()) { + status = error_handler_.SetBGError(versions_->io_status(), + BackgroundErrorReason::kManifestWrite); + } + return status; +} + Status DBImpl::LockWAL() { log_write_mutex_.Lock(); auto cur_log_writer = logs_.back().writer; @@ -1494,12 +1522,12 @@ Status DBImpl::UnlockWAL() { return Status::OK(); } -Status DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir) { - mutex_.AssertHeld(); +void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir, + VersionEdit* synced_wals) { + log_write_mutex_.AssertHeld(); if (synced_dir && logfile_number_ == up_to) { log_dir_synced_ = true; } - VersionEdit synced_wals; for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) { auto& wal = *it; assert(wal.IsSyncing()); @@ -1507,11 +1535,9 @@ Status DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir) { if (logs_.size() > 1) { if (immutable_db_options_.track_and_verify_wals_in_manifest && wal.GetPreSyncSize() > 0) { - synced_wals.AddWal(wal.number, WalMetadata(wal.GetPreSyncSize())); + synced_wals->AddWal(wal.number, WalMetadata(wal.GetPreSyncSize())); } logs_to_free_.push_back(wal.ReleaseWriter()); - // To modify logs_ both mutex_ and log_write_mutex_ must be held - InstrumentedMutexLock l(&log_write_mutex_); it = logs_.erase(it); } else { wal.FinishSync(); @@ -1520,22 +1546,11 @@ Status DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir) { } assert(logs_.empty() || logs_[0].number > up_to || (logs_.size() == 1 && !logs_[0].IsSyncing())); - - Status s; - if (synced_wals.IsWalAddition()) { - // not empty, write to MANIFEST. - s = versions_->LogAndApplyToDefaultColumnFamily(&synced_wals, &mutex_); - if (!s.ok() && versions_->io_status().IsIOError()) { - s = error_handler_.SetBGError(versions_->io_status(), - BackgroundErrorReason::kManifestWrite); - } - } log_sync_cv_.SignalAll(); - return s; } void DBImpl::MarkLogsNotSynced(uint64_t up_to) { - mutex_.AssertHeld(); + log_write_mutex_.AssertHeld(); for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to; ++it) { auto& wal = *it; diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index cecff3d1d..51dad1a63 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -998,6 +998,7 @@ class DBImpl : public DB { } void AddToLogsToFreeQueue(log::Writer* log_writer) { + mutex_.AssertHeld(); logs_to_free_queue_.push_back(log_writer); } @@ -1298,7 +1299,7 @@ class DBImpl : public DB { // only used for dynamically adjusting max_total_wal_size. 
it is a sum of // [write_buffer_size * max_write_buffer_number] over all column families - uint64_t max_total_in_memory_state_; + std::atomic max_total_in_memory_state_; // The options to access storage files const FileOptions file_options_; @@ -1648,6 +1649,15 @@ class DBImpl : public DB { uint64_t pre_sync_size = 0; }; + struct LogContext { + explicit LogContext(bool need_sync = false) + : need_log_sync(need_sync), need_log_dir_sync(need_sync) {} + bool need_log_sync = false; + bool need_log_dir_sync = false; + log::Writer* writer = nullptr; + LogFileNumberSize* log_file_number_size = nullptr; + }; + // PurgeFileInfo is a structure to hold information of files to be deleted in // purge_files_ struct PurgeFileInfo { @@ -1801,7 +1811,7 @@ class DBImpl : public DB { void ReleaseFileNumberFromPendingOutputs( std::unique_ptr::iterator>& v); - IOStatus SyncClosedLogs(JobContext* job_context); + IOStatus SyncClosedLogs(JobContext* job_context, VersionEdit* synced_wals); // Flush the in-memory write buffer to storage. Switches to a new // log-file/memtable and writes a new descriptor iff successful. Then @@ -1961,8 +1971,8 @@ class DBImpl : public DB { Status HandleWriteBufferManagerFlush(WriteContext* write_context); // REQUIRES: mutex locked - Status PreprocessWrite(const WriteOptions& write_options, bool* need_log_sync, - WriteContext* write_context); + Status PreprocessWrite(const WriteOptions& write_options, + LogContext* log_context, WriteContext* write_context); // Merge write batches in the write group into merged_batch. // Returns OK if merge is successful. @@ -2101,7 +2111,8 @@ class DBImpl : public DB { std::unique_ptr* token, LogBuffer* log_buffer); // helper function to call after some of the logs_ were synced - Status MarkLogsSynced(uint64_t up_to, bool synced_dir); + void MarkLogsSynced(uint64_t up_to, bool synced_dir, VersionEdit* edit); + Status ApplyWALToManifest(VersionEdit* edit); // WALs with log number up to up_to are not synced successfully. void MarkLogsNotSynced(uint64_t up_to); @@ -2307,8 +2318,9 @@ class DBImpl : public DB { // logfile_number_ is currently updated only in write_thread_, it can be read // from the same write_thread_ without any locks. uint64_t logfile_number_; - std::deque - log_recycle_files_; // a list of log files that we can recycle + // Log files that we can recycle. Must be protected by db mutex_. + std::deque log_recycle_files_; + // Protected by log_write_mutex_. bool log_dir_synced_; // Without two_write_queues, read and writes to log_empty_ are protected by // mutex_. Since it is currently updated/read only in write_thread_, it can be @@ -2322,26 +2334,93 @@ class DBImpl : public DB { bool persistent_stats_cfd_exists_ = true; - // Without two_write_queues, read and writes to alive_log_files_ are - // protected by mutex_. With two_write_queues_, writes - // are protected by locking both mutex_ and log_write_mutex_, and reads must - // be under either mutex_ or log_write_mutex_. + // alive_log_files_ is protected by mutex_ and log_write_mutex_ with details + // as follows: + // 1. read by FindObsoleteFiles() which can be called in either application + // thread or RocksDB bg threads, both mutex_ and log_write_mutex_ are + // held. + // 2. pop_front() by FindObsoleteFiles(), both mutex_ and log_write_mutex_ + // are held. + // 3. 
push_back() by DBImpl::Open() and DBImpl::RestoreAliveLogFiles() + // (actually called by Open()), only mutex_ is held because at this point, + // the DB::Open() call has not returned success to application, and the + // only other thread(s) that can conflict are bg threads calling + // FindObsoleteFiles() which ensure that both mutex_ and log_write_mutex_ + // are held when accessing alive_log_files_. + // 4. read by DBImpl::Open() is protected by mutex_. + // 5. push_back() by SwitchMemtable(). Both mutex_ and log_write_mutex_ are + // held. This is done by the write group leader. Note that in the case of + // two-write-queues, another WAL-only write thread can be writing to the + // WAL concurrently. See 9. + // 6. read by SwitchWAL() with both mutex_ and log_write_mutex_ held. This is + // done by write group leader. + // 7. read by ConcurrentWriteToWAL() by the write group leader in the case of + // two-write-queues. Only log_write_mutex_ is held to protect concurrent + // pop_front() by FindObsoleteFiles(). + // 8. read by PreprocessWrite() by the write group leader. log_write_mutex_ + // is held to protect the data structure from concurrent pop_front() by + // FindObsoleteFiles(). + // 9. read by ConcurrentWriteToWAL() by a WAL-only write thread in the case + // of two-write-queues. Only log_write_mutex_ is held. This suffices to + // protect the data structure from concurrent push_back() by current + // write group leader as well as pop_front() by FindObsoleteFiles(). std::deque alive_log_files_; // Log files that aren't fully synced, and the current log file. // Synchronization: - // - push_back() is done from write_thread_ with locked mutex_ and - // log_write_mutex_ - // - pop_front() is done from any thread with locked mutex_ and - // log_write_mutex_ - // - reads are done with either locked mutex_ or log_write_mutex_ + // 1. read by FindObsoleteFiles() which can be called either in application + // thread or RocksDB bg threads. log_write_mutex_ is always held, while + // some reads are performed without mutex_. + // 2. pop_front() by FindObsoleteFiles() with only log_write_mutex_ held. + // 3. read by DBImpl::Open() with both mutex_ and log_write_mutex_. + // 4. emplace_back() by DBImpl::Open() with both mutex_ and log_write_mutex. + // Note that at this point, DB::Open() has not returned success to + // application, thus the only other thread(s) that can conflict are bg + // threads calling FindObsoleteFiles(). See 1. + // 5. iteration and clear() from CloseHelper() always hold log_write_mutex + // and mutex_. + // 6. back() called by APIs FlushWAL() and LockWAL() are protected by only + // log_write_mutex_. These two can be called by application threads after + // DB::Open() returns success to applications. + // 7. read by SyncWAL(), another API, protected by only log_write_mutex_. + // 8. read by MarkLogsNotSynced() and MarkLogsSynced() are protected by + // log_write_mutex_. + // 9. erase() by MarkLogsSynced() protected by log_write_mutex_. + // 10. read by SyncClosedLogs() protected by only log_write_mutex_. This can + // happen in bg flush threads after DB::Open() returns success to + // applications. + // 11. reads, e.g. front(), iteration, and back() called by PreprocessWrite() + // holds only the log_write_mutex_. This is done by the write group + // leader. A bg thread calling FindObsoleteFiles() or MarkLogsSynced() + // can happen concurrently. This is fine because log_write_mutex_ is used + // by all parties. See 2, 5, 9. + // 12. 
reads, empty(), back() called by SwitchMemtable() hold both mutex_ and + // log_write_mutex_. This happens in the write group leader. + // 13. emplace_back() by SwitchMemtable() hold both mutex_ and + // log_write_mutex_. This happens in the write group leader. Can conflict + // with bg threads calling FindObsoleteFiles(), MarkLogsSynced(), + // SyncClosedLogs(), etc. as well as application threads calling + // FlushWAL(), SyncWAL(), LockWAL(). This is fine because all parties + // require at least log_write_mutex_. + // 14. iteration called in WriteToWAL(write_group) protected by + // log_write_mutex_. This is done by write group leader when + // two-write-queues is disabled and write needs to sync logs. + // 15. back() called in ConcurrentWriteToWAL() protected by log_write_mutex_. + // This can be done by the write group leader if two-write-queues is + // enabled. It can also be done by another WAL-only write thread. + // + // Other observations: // - back() and items with getting_synced=true are not popped, // - The same thread that sets getting_synced=true will reset it. // - it follows that the object referred by back() can be safely read from - // the write_thread_ without using mutex + // the write_thread_ without using mutex. Note that calling back() without + // mutex may be unsafe because different implementations of deque::back() may + // access other member variables of deque, causing undefined behaviors. + // Generally, do not access stl containers without proper synchronization. // - it follows that the items with getting_synced=true can be safely read // from the same thread that has set getting_synced=true std::deque logs_; + // Signaled when getting_synced becomes false for some of the logs_. InstrumentedCondVar log_sync_cv_; // This is the app-level state that is written to the WAL but will be used @@ -2356,7 +2435,7 @@ class DBImpl : public DB { std::atomic total_log_size_; // If this is non-empty, we need to delete these log files in background - // threads. Protected by db mutex. + // threads. Protected by log_write_mutex_. autovector logs_to_free_; bool is_snapshot_supported_; @@ -2436,10 +2515,13 @@ class DBImpl : public DB { // JobContext. Current implementation tracks table and blob files only. std::unordered_set files_grabbed_for_purge_; - // A queue to store log writers to close + // A queue to store log writers to close. Protected by db mutex_. 
std::deque logs_to_free_queue_; + std::deque superversions_to_free_queue_; + int unscheduled_flushes_; + int unscheduled_compactions_; // count how many background compactions are running or have been scheduled in @@ -2592,6 +2674,7 @@ class DBImpl : public DB { InstrumentedCondVar atomic_flush_install_cv_; bool wal_in_db_path_; + std::atomic max_total_wal_size_; BlobFileCompletionCallback blob_callback_; diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 91c8bb080..351431e52 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -82,9 +82,10 @@ bool DBImpl::RequestCompactionToken(ColumnFamilyData* cfd, bool force, return false; } -IOStatus DBImpl::SyncClosedLogs(JobContext* job_context) { +IOStatus DBImpl::SyncClosedLogs(JobContext* job_context, + VersionEdit* synced_wals) { TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start"); - mutex_.AssertHeld(); + InstrumentedMutexLock l(&log_write_mutex_); autovector logs_to_sync; uint64_t current_log_number = logfile_number_; while (logs_.front().number < current_log_number && @@ -100,7 +101,7 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context) { IOStatus io_s; if (!logs_to_sync.empty()) { - mutex_.Unlock(); + log_write_mutex_.Unlock(); assert(job_context); @@ -128,12 +129,12 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context) { TEST_SYNC_POINT_CALLBACK("DBImpl::SyncClosedLogs:BeforeReLock", /*arg=*/nullptr); - mutex_.Lock(); + log_write_mutex_.Lock(); // "number <= current_log_number - 1" is equivalent to // "number < current_log_number". if (io_s.ok()) { - io_s = status_to_io_status(MarkLogsSynced(current_log_number - 1, true)); + MarkLogsSynced(current_log_number - 1, true, synced_wals); } else { MarkLogsNotSynced(current_log_number - 1); } @@ -220,8 +221,16 @@ Status DBImpl::FlushMemTableToOutputFile( bool need_cancel = false; IOStatus log_io_s = IOStatus::OK(); if (needs_to_sync_closed_wals) { - // SyncClosedLogs() may unlock and re-lock the db_mutex. - log_io_s = SyncClosedLogs(job_context); + // SyncClosedLogs() may unlock and re-lock the log_write_mutex multiple + // times. + VersionEdit synced_wals; + mutex_.Unlock(); + log_io_s = SyncClosedLogs(job_context, &synced_wals); + mutex_.Lock(); + if (log_io_s.ok() && synced_wals.IsWalAddition()) { + log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals)); + } + if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() && !log_io_s.IsColumnFamilyDropped()) { error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush); @@ -474,7 +483,14 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( if (logfile_number_ > 0) { // TODO (yanqin) investigate whether we should sync the closed logs for // single column family case. 
- log_io_s = SyncClosedLogs(job_context); + VersionEdit synced_wals; + mutex_.Unlock(); + log_io_s = SyncClosedLogs(job_context, &synced_wals); + mutex_.Lock(); + if (log_io_s.ok() && synced_wals.IsWalAddition()) { + log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals)); + } + if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() && !log_io_s.IsColumnFamilyDropped()) { if (total_log_size_ > 0) { diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc index 8ea09e883..0252a3524 100644 --- a/db/db_impl/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -223,7 +223,7 @@ void DBImpl::TEST_EndWrite(void* w) { } size_t DBImpl::TEST_LogsToFreeSize() { - InstrumentedMutexLock l(&mutex_); + InstrumentedMutexLock l(&log_write_mutex_); return logs_to_free_.size(); } diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index 749623cd1..da87aa5cd 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -271,6 +271,15 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, // logs_ is empty when called during recovery, in which case there can't yet // be any tracked obsolete logs + log_write_mutex_.Lock(); + + if (alive_log_files_.empty() || logs_.empty()) { + mutex_.AssertHeld(); + // We may reach here if the db is DBImplSecondary + log_write_mutex_.Unlock(); + return; + } + if (!alive_log_files_.empty() && !logs_.empty()) { uint64_t min_log_number = job_context->log_number; size_t num_alive_log_files = alive_log_files_.size(); @@ -292,17 +301,15 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, } job_context->size_log_to_delete += earliest.size; total_log_size_ -= earliest.size; - if (two_write_queues_) { - log_write_mutex_.Lock(); - } alive_log_files_.pop_front(); - if (two_write_queues_) { - log_write_mutex_.Unlock(); - } + // Current log should always stay alive since it can't have // number < MinLogNumber(). assert(alive_log_files_.size()); } + log_write_mutex_.Unlock(); + mutex_.Unlock(); + log_write_mutex_.Lock(); while (!logs_.empty() && logs_.front().number < min_log_number) { auto& log = logs_.front(); if (log.IsSyncing()) { @@ -311,10 +318,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, continue; } logs_to_free_.push_back(log.ReleaseWriter()); - { - InstrumentedMutexLock wl(&log_write_mutex_); - logs_.pop_front(); - } + logs_.pop_front(); } // Current log cannot be obsolete. assert(!logs_.empty()); @@ -323,23 +327,13 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, // We're just cleaning up for DB::Write(). 
assert(job_context->logs_to_free.empty()); job_context->logs_to_free = logs_to_free_; + + logs_to_free_.clear(); + log_write_mutex_.Unlock(); + mutex_.Lock(); job_context->log_recycle_files.assign(log_recycle_files_.begin(), log_recycle_files_.end()); - logs_to_free_.clear(); -} - -namespace { -bool CompareCandidateFile(const JobContext::CandidateFileInfo& first, - const JobContext::CandidateFileInfo& second) { - if (first.file_name > second.file_name) { - return true; - } else if (first.file_name < second.file_name) { - return false; - } else { - return (first.file_path > second.file_path); - } } -} // namespace // Delete obsolete files and log status and information of file deletion void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname, @@ -445,7 +439,16 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { // dedup state.candidate_files so we don't try to delete the same // file twice std::sort(candidate_files.begin(), candidate_files.end(), - CompareCandidateFile); + [](const JobContext::CandidateFileInfo& lhs, + const JobContext::CandidateFileInfo& rhs) { + if (lhs.file_name > rhs.file_name) { + return true; + } else if (lhs.file_name < rhs.file_name) { + return false; + } else { + return (lhs.file_path > rhs.file_path); + } + }); candidate_files.erase( std::unique(candidate_files.begin(), candidate_files.end()), candidate_files.end()); diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 69fd17183..47d558e57 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -1459,9 +1459,6 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector& wal_numbers) { Status s; mutex_.AssertHeld(); assert(immutable_db_options_.avoid_flush_during_recovery); - if (two_write_queues_) { - log_write_mutex_.Lock(); - } // Mark these as alive so they'll be considered for deletion later by // FindObsoleteFiles() total_log_size_ = 0; @@ -1486,9 +1483,6 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector& wal_numbers) { total_log_size_ += log.size; alive_log_files_.push_back(log); } - if (two_write_queues_) { - log_write_mutex_.Unlock(); - } return s; } @@ -1871,16 +1865,8 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, } if (s.ok()) { - if (impl->two_write_queues_) { - impl->log_write_mutex_.Lock(); - } impl->alive_log_files_.push_back( DBImpl::LogFileNumberSize(impl->logfile_number_)); - if (impl->two_write_queues_) { - impl->log_write_mutex_.Unlock(); - } - } - if (s.ok()) { // In WritePrepared there could be gap in sequence numbers. This breaks // the trick we use in kPointInTimeRecovery which assumes the first seq in // the log right after the corrupted log is one larger than the last seq diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 2f46efb9b..0f539e2fa 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -349,14 +349,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // when it finds suitable, and finish them in the same write batch. // This is how a write job could be done by the other writer. 
WriteContext write_context; + LogContext log_context(write_options.sync); WriteThread::WriteGroup write_group; bool in_parallel_group = false; uint64_t last_sequence = kMaxSequenceNumber; - mutex_.Lock(); - - bool need_log_sync = write_options.sync; - bool need_log_dir_sync = need_log_sync && !log_dir_synced_; assert(!two_write_queues_ || !disable_memtable); { // With concurrent writes we do preprocess only in the write thread that @@ -366,7 +363,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // PreprocessWrite does its own perf timing. PERF_TIMER_STOP(write_pre_and_post_process_time); - status = PreprocessWrite(write_options, &need_log_sync, &write_context); + status = PreprocessWrite(write_options, &log_context, &write_context); if (!two_write_queues_) { // Assign it after ::PreprocessWrite since the sequence might advance // inside it by WriteRecoverableState @@ -376,13 +373,6 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, PERF_TIMER_START(write_pre_and_post_process_time); } - log::Writer* log_writer = logs_.back().writer; - LogFileNumberSize& log_file_number_size = alive_log_files_.back(); - - assert(log_writer->get_log_number() == log_file_number_size.number); - - mutex_.Unlock(); - // Add to log and apply to memtable. We can release the lock // during this phase since &w is currently responsible for logging // and protects against concurrent loggers and concurrent writes @@ -477,10 +467,14 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, if (!two_write_queues_) { if (status.ok() && !write_options.disableWAL) { + assert(log_context.log_file_number_size); + LogFileNumberSize& log_file_number_size = + *(log_context.log_file_number_size); PERF_TIMER_GUARD(write_wal_time); - io_s = WriteToWAL(write_group, log_writer, log_used, need_log_sync, - need_log_dir_sync, last_sequence + 1, - log_file_number_size); + io_s = + WriteToWAL(write_group, log_context.writer, log_used, + log_context.need_log_sync, log_context.need_log_dir_sync, + last_sequence + 1, log_file_number_size); } } else { if (status.ok() && !write_options.disableWAL) { @@ -582,14 +576,21 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, assert(pre_release_cb_status.ok()); } - if (need_log_sync) { - mutex_.Lock(); + if (log_context.need_log_sync) { + VersionEdit synced_wals; + log_write_mutex_.Lock(); if (status.ok()) { - status = MarkLogsSynced(logfile_number_, need_log_dir_sync); + MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync, + &synced_wals); } else { MarkLogsNotSynced(logfile_number_); } - mutex_.Unlock(); + log_write_mutex_.Unlock(); + if (status.ok() && synced_wals.IsWalAddition()) { + InstrumentedMutexLock l(&mutex_); + status = ApplyWALToManifest(&synced_wals); + } + // Requesting sync with two_write_queues_ is expected to be very rare. We // hence provide a simple implementation that is not necessarily efficient. if (two_write_queues_) { @@ -652,19 +653,11 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, if (w.callback && !w.callback->AllowWriteBatching()) { write_thread_.WaitForMemTableWriters(); } - mutex_.Lock(); - bool need_log_sync = !write_options.disableWAL && write_options.sync; - bool need_log_dir_sync = need_log_sync && !log_dir_synced_; + LogContext log_context(!write_options.disableWAL && write_options.sync); // PreprocessWrite does its own perf timing. 
PERF_TIMER_STOP(write_pre_and_post_process_time); - w.status = PreprocessWrite(write_options, &need_log_sync, &write_context); + w.status = PreprocessWrite(write_options, &log_context, &write_context); PERF_TIMER_START(write_pre_and_post_process_time); - log::Writer* log_writer = logs_.back().writer; - LogFileNumberSize& log_file_number_size = alive_log_files_.back(); - - assert(log_writer->get_log_number() == log_file_number_size.number); - - mutex_.Unlock(); // This can set non-OK status if callback fail. last_batch_group_size_ = @@ -727,9 +720,13 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, wal_write_group.size - 1); RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1); } + assert(log_context.log_file_number_size); + LogFileNumberSize& log_file_number_size = + *(log_context.log_file_number_size); io_s = - WriteToWAL(wal_write_group, log_writer, log_used, need_log_sync, - need_log_dir_sync, current_sequence, log_file_number_size); + WriteToWAL(wal_write_group, log_context.writer, log_used, + log_context.need_log_sync, log_context.need_log_dir_sync, + current_sequence, log_file_number_size); w.status = io_s; } @@ -740,16 +737,20 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, WriteStatusCheck(w.status); } - if (need_log_sync) { - mutex_.Lock(); + VersionEdit synced_wals; + if (log_context.need_log_sync) { + InstrumentedMutexLock l(&log_write_mutex_); if (w.status.ok()) { - w.status = MarkLogsSynced(logfile_number_, need_log_dir_sync); + MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync, + &synced_wals); } else { MarkLogsNotSynced(logfile_number_); } - mutex_.Unlock(); } - + if (w.status.ok() && synced_wals.IsWalAddition()) { + InstrumentedMutexLock l(&mutex_); + w.status = ApplyWALToManifest(&synced_wals); + } write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status); } @@ -893,9 +894,8 @@ Status DBImpl::WriteImplWALOnly( // TODO(myabandeh): Make preliminary checks thread-safe so we could do them // without paying the cost of obtaining the mutex. if (status.ok()) { - InstrumentedMutexLock l(&mutex_); - bool need_log_sync = false; - status = PreprocessWrite(write_options, &need_log_sync, &write_context); + LogContext log_context; + status = PreprocessWrite(write_options, &log_context, &write_context); WriteStatusCheckOnLocked(status); } if (!status.ok()) { @@ -1057,9 +1057,8 @@ Status DBImpl::WriteImplWALOnly( void DBImpl::WriteStatusCheckOnLocked(const Status& status) { // Is setting bg_error_ enough here? This will at least stop // compaction and fail any further writes. - // Caller must hold mutex_. + InstrumentedMutexLock l(&mutex_); assert(!status.IsIOFenced() || !error_handler_.GetBGError().ok()); - mutex_.AssertHeld(); if (immutable_db_options_.paranoid_checks && !status.ok() && !status.IsBusy() && !status.IsIncomplete()) { // Maybe change the return status to void? 
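To summarize the write-path change in the hunks above and below: PreprocessWrite() now fills a LogContext (current WAL writer, need_log_sync, alive-log entry) under log_write_mutex_, and the global mutex_ is acquired only inside the rare slow-path branches (flush scheduling, write stall, WAL switch), so WriteImpl() and PipelinedWriteImpl() no longer lock mutex_ on the common path. A rough, self-contained sketch of that shape (not the actual DBImpl code; WritePathSketch and its members are made-up stand-ins):

#include <atomic>
#include <iostream>
#include <mutex>
#include <string>

// Stands in for the LogContext struct introduced by this patch.
struct LogContext {
  bool need_log_sync = false;
  std::string writer;  // placeholder for log::Writer*
};

class WritePathSketch {
 public:
  LogContext PreprocessWrite(bool sync_requested) {
    // Rare slow paths are the only places that still pay for the db mutex.
    if (needs_flush_.load(std::memory_order_acquire)) {
      std::lock_guard<std::mutex> db_lock(db_mutex_);
      std::cout << "switching memtable/WAL under db mutex\n";
      needs_flush_.store(false, std::memory_order_release);
    }
    if (write_stalled_.load(std::memory_order_acquire)) {
      std::lock_guard<std::mutex> db_lock(db_mutex_);
      std::cout << "waiting out a write stall under db mutex\n";
      write_stalled_.store(false, std::memory_order_release);
    }
    // Common path: capture the current WAL writer under the narrow log mutex.
    LogContext ctx;
    ctx.need_log_sync = sync_requested;
    std::lock_guard<std::mutex> log_lock(log_write_mutex_);
    ctx.writer = current_wal_;
    return ctx;
  }

  void Write(const std::string& key, bool sync) {
    LogContext ctx = PreprocessWrite(sync);
    // WAL append and memtable insert happen without holding db_mutex_.
    std::cout << "append '" << key << "' to " << ctx.writer
              << (ctx.need_log_sync ? " (sync)" : "") << "\n";
  }

  void MarkMemtableFull() { needs_flush_.store(true, std::memory_order_release); }

 private:
  std::mutex db_mutex_;         // analogous to DBImpl::mutex_
  std::mutex log_write_mutex_;  // analogous to DBImpl::log_write_mutex_
  std::atomic<bool> needs_flush_{false};
  std::atomic<bool> write_stalled_{false};
  std::string current_wal_ = "000012.log";
};

int main() {
  WritePathSketch db;
  db.Write("k1", /*sync=*/false);
  db.MarkMemtableFull();
  db.Write("k2", /*sync=*/true);
  return 0;
}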
@@ -1110,13 +1109,13 @@ void DBImpl::MemTableInsertStatusCheck(const Status& status) { } Status DBImpl::PreprocessWrite(const WriteOptions& write_options, - bool* need_log_sync, + LogContext* log_context, WriteContext* write_context) { - mutex_.AssertHeld(); - assert(write_context != nullptr && need_log_sync != nullptr); + assert(write_context != nullptr && log_context != nullptr); Status status; if (error_handler_.IsDBStopped()) { + InstrumentedMutexLock l(&mutex_); status = error_handler_.GetBGError(); } @@ -1124,11 +1123,11 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, if (UNLIKELY(status.ok() && total_log_size_ > GetMaxTotalWalSize())) { assert(versions_); + InstrumentedMutexLock l(&mutex_); const ColumnFamilySet* const column_families = versions_->GetColumnFamilySet(); assert(column_families); size_t num_cfs = column_families->NumberOfColumnFamilies(); - assert(num_cfs >= 1); if (num_cfs > 1) { WaitForPendingWrites(); @@ -1142,15 +1141,18 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, // thread is writing to another DB with the same write buffer, they may also // be flushed. We may end up with flushing much more DBs than needed. It's // suboptimal but still correct. + InstrumentedMutexLock l(&mutex_); WaitForPendingWrites(); status = HandleWriteBufferManagerFlush(write_context); } if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) { + InstrumentedMutexLock l(&mutex_); status = TrimMemtableHistory(write_context); } if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) { + InstrumentedMutexLock l(&mutex_); WaitForPendingWrites(); status = ScheduleFlushes(write_context); } @@ -1166,6 +1168,7 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, // for previous one. It might create a fairness issue that expiration // might happen for smaller writes but larger writes can go through. // Can optimize it if it is an issue. + InstrumentedMutexLock l(&mutex_); status = DelayWrite(last_batch_group_size_, write_options); PERF_TIMER_START(write_pre_and_post_process_time); } @@ -1180,11 +1183,12 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, if (write_options.no_slowdown) { status = Status::Incomplete("Write stall"); } else { + InstrumentedMutexLock l(&mutex_); WriteBufferManagerStallWrites(); } } - - if (status.ok() && *need_log_sync) { + InstrumentedMutexLock l(&log_write_mutex_); + if (status.ok() && log_context->need_log_sync) { // Wait until the parallel syncs are finished. Any sync process has to sync // the front log too so it is enough to check the status of front() // We do a while loop since log_sync_cv_ is signalled when any sync is @@ -1204,8 +1208,12 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, log.PrepareForSync(); } } else { - *need_log_sync = false; + log_context->need_log_sync = false; } + log_context->writer = logs_.back().writer; + log_context->need_log_dir_sync = + log_context->need_log_dir_sync && !log_dir_synced_; + log_context->log_file_number_size = std::addressof(alive_log_files_.back()); return status; } @@ -1714,10 +1722,12 @@ Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) { } uint64_t DBImpl::GetMaxTotalWalSize() const { - mutex_.AssertHeld(); - return mutable_db_options_.max_total_wal_size == 0 - ? 
4 * max_total_in_memory_state_ - : mutable_db_options_.max_total_wal_size; + uint64_t max_total_wal_size = + max_total_wal_size_.load(std::memory_order_acquire); + if (max_total_wal_size > 0) { + return max_total_wal_size; + } + return 4 * max_total_in_memory_state_.load(std::memory_order_acquire); } // REQUIRES: mutex_ is held @@ -2065,7 +2075,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { log_recycle_files_.pop_front(); } if (s.ok() && creating_new_log) { - log_write_mutex_.Lock(); + InstrumentedMutexLock l(&log_write_mutex_); assert(new_log != nullptr); if (!logs_.empty()) { // Alway flush the buffer of the last log before switching to a new one @@ -2089,7 +2099,6 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { logs_.emplace_back(logfile_number_, new_log); alive_log_files_.push_back(LogFileNumberSize(logfile_number_)); } - log_write_mutex_.Unlock(); } if (!s.ok()) { diff --git a/db/error_handler.cc b/db/error_handler.cc index b00611ac9..1df01267f 100644 --- a/db/error_handler.cc +++ b/db/error_handler.cc @@ -358,6 +358,9 @@ const Status& ErrorHandler::HandleKnownErrors(const Status& bg_err, RecoverFromNoSpace(); } } + if (bg_error_.severity() >= Status::Severity::kHardError) { + is_db_stopped_.store(true, std::memory_order_release); + } return bg_error_; } @@ -736,6 +739,7 @@ void ErrorHandler::RecoverFromRetryableBGIOError() { // the bg_error and notify user. TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverSuccess"); Status old_bg_error = bg_error_; + is_db_stopped_.store(false, std::memory_order_release); bg_error_ = Status::OK(); bg_error_.PermitUncheckedError(); EventHelpers::NotifyOnErrorRecoveryEnd( @@ -792,6 +796,9 @@ void ErrorHandler::CheckAndSetRecoveryAndBGError(const Status& bg_err) { if (bg_err.severity() > bg_error_.severity()) { bg_error_ = bg_err; } + if (bg_error_.severity() >= Status::Severity::kHardError) { + is_db_stopped_.store(true, std::memory_order_release); + } return; } diff --git a/db/error_handler.h b/db/error_handler.h index 9dab56e91..e7c47b763 100644 --- a/db/error_handler.h +++ b/db/error_handler.h @@ -38,6 +38,7 @@ class ErrorHandler { auto_recovery_(false), recovery_in_prog_(false), soft_error_no_bg_work_(false), + is_db_stopped_(false), bg_error_stats_(db_options.statistics) { // Clear the checked flag for uninitialized errors bg_error_.PermitUncheckedError(); @@ -59,16 +60,15 @@ class ErrorHandler { Status ClearBGError(); - bool IsDBStopped() { - return !bg_error_.ok() && - bg_error_.severity() >= Status::Severity::kHardError; - } + bool IsDBStopped() { return is_db_stopped_.load(std::memory_order_acquire); } - bool IsBGWorkStopped() { - return !bg_error_.ok() && - (bg_error_.severity() >= Status::Severity::kHardError || - !auto_recovery_ || soft_error_no_bg_work_); - } + bool IsBGWorkStopped() { + assert(db_mutex_); + db_mutex_->AssertHeld(); + return !bg_error_.ok() && + (bg_error_.severity() >= Status::Severity::kHardError || + !auto_recovery_ || soft_error_no_bg_work_); + } bool IsSoftErrorNoBGWork() { return soft_error_no_bg_work_; } @@ -105,6 +105,7 @@ class ErrorHandler { // Used to store the context for recover, such as flush reason. DBRecoverContext recover_context_; + std::atomic is_db_stopped_; // The pointer of DB statistics. 
std::shared_ptr bg_error_stats_; diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index 3bb8dd53c..69386fbd2 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -390,7 +390,7 @@ void ProfileQueries(bool enabled_time = false) { EXPECT_GT(hist_write_scheduling_time.Average(), 0); #ifndef NDEBUG - ASSERT_GT(total_db_mutex_nanos, 2000U); + ASSERT_LT(total_db_mutex_nanos, 100U); #endif } diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 47bfe5ee8..5d08e95ec 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -1978,6 +1978,7 @@ TEST_F(VersionSetTest, WalCreateAfterClose) { TEST_F(VersionSetTest, AddWalWithSmallerSize) { NewDB(); + assert(versions_); constexpr WalNumber kLogNumber = 10; constexpr uint64_t kSizeInBytes = 111; @@ -1990,6 +1991,9 @@ TEST_F(VersionSetTest, AddWalWithSmallerSize) { ASSERT_OK(LogAndApplyToDefaultCF(edit)); } + // Copy for future comparison. + const std::map wals1 = + versions_->GetWalSet().GetWals(); { // Add the same WAL with smaller synced size. @@ -1998,13 +2002,11 @@ TEST_F(VersionSetTest, AddWalWithSmallerSize) { edit.AddWal(kLogNumber, wal); Status s = LogAndApplyToDefaultCF(edit); - ASSERT_TRUE(s.IsCorruption()); - ASSERT_TRUE( - s.ToString().find( - "WAL 10 must not have smaller synced size than previous one") != - std::string::npos) - << s.ToString(); + ASSERT_OK(s); } + const std::map wals2 = + versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals1, wals2); } TEST_F(VersionSetTest, DeleteWalsBeforeNonExistingWalNumber) { diff --git a/db/wal_edit.cc b/db/wal_edit.cc index 786d68b5c..2525be610 100644 --- a/db/wal_edit.cc +++ b/db/wal_edit.cc @@ -112,26 +112,33 @@ Status WalSet::AddWal(const WalAddition& wal) { auto it = wals_.lower_bound(wal.GetLogNumber()); bool existing = it != wals_.end() && it->first == wal.GetLogNumber(); - if (existing && !wal.GetMetadata().HasSyncedSize()) { - std::stringstream ss; - ss << "WAL " << wal.GetLogNumber() << " is created more than once"; - return Status::Corruption("WalSet::AddWal", ss.str()); + + if (!existing) { + wals_.insert(it, {wal.GetLogNumber(), wal.GetMetadata()}); + return Status::OK(); } - // If the WAL has synced size, it must >= the previous size. - if (wal.GetMetadata().HasSyncedSize() && existing && - it->second.HasSyncedSize() && - wal.GetMetadata().GetSyncedSizeInBytes() < - it->second.GetSyncedSizeInBytes()) { + + assert(existing); + if (!wal.GetMetadata().HasSyncedSize()) { std::stringstream ss; - ss << "WAL " << wal.GetLogNumber() - << " must not have smaller synced size than previous one"; + ss << "WAL " << wal.GetLogNumber() << " is created more than once"; return Status::Corruption("WalSet::AddWal", ss.str()); } - if (existing) { - it->second.SetSyncedSizeInBytes(wal.GetMetadata().GetSyncedSizeInBytes()); - } else { - wals_.insert(it, {wal.GetLogNumber(), wal.GetMetadata()}); + + assert(wal.GetMetadata().HasSyncedSize()); + if (it->second.HasSyncedSize() && wal.GetMetadata().GetSyncedSizeInBytes() <= + it->second.GetSyncedSizeInBytes()) { + // This is possible because version edits with different synced WAL sizes + // for the same WAL can be committed out-of-order. For example, thread + // 1 synces the first 10 bytes of 1.log, while thread 2 synces the first 20 + // bytes of 1.log. It's possible that thread 1 calls LogAndApply() after + // thread 2. + // In this case, just return ok. + return Status::OK(); } + + // Update synced size for the given WAL. 
+ it->second.SetSyncedSizeInBytes(wal.GetMetadata().GetSyncedSizeInBytes()); return Status::OK(); } diff --git a/db/wal_edit.h b/db/wal_edit.h index 23dc58905..bb5c5e292 100644 --- a/db/wal_edit.h +++ b/db/wal_edit.h @@ -42,6 +42,8 @@ class WalMetadata { uint64_t GetSyncedSizeInBytes() const { return synced_size_bytes_; } private: + friend bool operator==(const WalMetadata& lhs, const WalMetadata& rhs); + friend bool operator!=(const WalMetadata& lhs, const WalMetadata& rhs); // The size of WAL is unknown, used when the WAL is not synced yet or is // empty. constexpr static uint64_t kUnknownWalSize = @@ -51,6 +53,14 @@ class WalMetadata { uint64_t synced_size_bytes_ = kUnknownWalSize; }; +inline bool operator==(const WalMetadata& lhs, const WalMetadata& rhs) { + return lhs.synced_size_bytes_ == rhs.synced_size_bytes_; +} + +inline bool operator!=(const WalMetadata& lhs, const WalMetadata& rhs) { + return !(lhs == rhs); +} + // These tags are persisted to MANIFEST, so it's part of the user API. enum class WalAdditionTag : uint32_t { // Indicates that there are no more tags. diff --git a/db/wal_edit_test.cc b/db/wal_edit_test.cc index 5895e180d..0c18fb125 100644 --- a/db/wal_edit_test.cc +++ b/db/wal_edit_test.cc @@ -54,12 +54,11 @@ TEST(WalSet, SmallerSyncedSize) { constexpr uint64_t kBytes = 100; WalSet wals; ASSERT_OK(wals.AddWal(WalAddition(kNumber, WalMetadata(kBytes)))); + const auto wals1 = wals.GetWals(); Status s = wals.AddWal(WalAddition(kNumber, WalMetadata(0))); - ASSERT_TRUE(s.IsCorruption()); - ASSERT_TRUE( - s.ToString().find( - "WAL 100 must not have smaller synced size than previous one") != - std::string::npos); + const auto wals2 = wals.GetWals(); + ASSERT_OK(s); + ASSERT_EQ(wals1, wals2); } TEST(WalSet, CreateTwice) { diff --git a/utilities/backup/backup_engine_test.cc b/utilities/backup/backup_engine_test.cc index 1f49bdaf6..3f3567af7 100644 --- a/utilities/backup/backup_engine_test.cc +++ b/utilities/backup/backup_engine_test.cc @@ -1115,7 +1115,8 @@ TEST_P(BackupEngineTestWithParam, OfflineIntegrationTest) { destroy_data = false; // kAutoFlushOnly to preserve legacy test behavior (consider updating) FillDB(db_.get(), keys_iteration * i, fill_up_to, kAutoFlushOnly); - ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), iter == 0)); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), iter == 0)) + << "iter: " << iter << ", idx: " << i; CloseDBAndBackupEngine(); DestroyDB(dbname_, options_);
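The WalSet::AddWal() change above tolerates a repeated WAL addition whose synced size is smaller than or equal to the recorded one, because with the new locking scheme version edits for the same WAL can be committed to the MANIFEST out of order; the updated tests reflect that this case is no longer reported as corruption. A small standalone sketch of that rule (not the RocksDB WalSet; the names here are made up):

#include <cstdint>
#include <iostream>
#include <map>

class WalSetSketch {
 public:
  // Returns true if the stored synced size changed.
  bool AddWal(uint64_t number, uint64_t synced_size) {
    auto it = wals_.find(number);
    if (it == wals_.end()) {
      wals_.emplace(number, synced_size);
      return true;
    }
    if (synced_size <= it->second) {
      // An older sync point committed out of order: keep the larger size.
      return false;
    }
    it->second = synced_size;
    return true;
  }

  uint64_t SyncedSize(uint64_t number) const { return wals_.at(number); }

 private:
  std::map<uint64_t, uint64_t> wals_;  // WAL number -> largest synced size seen
};

int main() {
  WalSetSketch wals;
  wals.AddWal(100, 20);                 // the larger sync lands first
  bool changed = wals.AddWal(100, 10);  // an older, smaller sync arrives later
  std::cout << std::boolalpha << "changed=" << changed
            << " synced_size=" << wals.SyncedSize(100) << "\n";
  // Prints: changed=false synced_size=20
  return 0;
}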