From b283f041f58deb87a7a0fa21005c963d0f0dee88 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Tue, 12 Jul 2022 17:16:57 -0700 Subject: [PATCH] Stop tracking syncing live WAL for performance (#10330) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: With https://github.com/facebook/rocksdb/issues/10087, applications calling `SyncWAL()` or writing with `WriteOptions::sync=true` can suffer from performance regression. This PR reverts to original behavior of tracking the syncing of closed WALs. After we revert back to old behavior, recovery, whether kPointInTime or kAbsoluteConsistency, may fail to detect corruption in synced WALs if the corruption is in the live WAL. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10330 Test Plan: make check Before https://github.com/facebook/rocksdb/issues/10087 ```bash fillsync : 750.269 micros/op 1332 ops/sec 75.027 seconds 100000 operations; 0.1 MB/s (100 ops) fillsync : 776.492 micros/op 1287 ops/sec 77.649 seconds 100000 operations; 0.1 MB/s (100 ops) fillsync [AVG 2 runs] : 1310 (± 44) ops/sec; 0.1 (± 0.0) MB/sec fillsync : 805.625 micros/op 1241 ops/sec 80.563 seconds 100000 operations; 0.1 MB/s (100 ops) fillsync [AVG 3 runs] : 1287 (± 51) ops/sec; 0.1 (± 0.0) MB/sec fillsync [AVG 3 runs] : 1287 (± 51) ops/sec; 0.1 (± 0.0) MB/sec fillsync [MEDIAN 3 runs] : 1287 ops/sec; 0.1 MB/sec ``` Before this PR and after https://github.com/facebook/rocksdb/issues/10087 ```bash fillsync : 1479.601 micros/op 675 ops/sec 147.960 seconds 100000 operations; 0.1 MB/s (100 ops) fillsync : 1626.080 micros/op 614 ops/sec 162.608 seconds 100000 operations; 0.1 MB/s (100 ops) fillsync [AVG 2 runs] : 645 (± 59) ops/sec; 0.1 (± 0.0) MB/sec fillsync : 1588.402 micros/op 629 ops/sec 158.840 seconds 100000 operations; 0.1 MB/s (100 ops) fillsync [AVG 3 runs] : 640 (± 35) ops/sec; 0.1 (± 0.0) MB/sec fillsync [AVG 3 runs] : 640 (± 35) ops/sec; 0.1 (± 0.0) MB/sec fillsync [MEDIAN 3 runs] : 
629 ops/sec; 0.1 MB/sec ``` After this PR ```bash fillsync : 749.621 micros/op 1334 ops/sec 74.962 seconds 100000 operations; 0.1 MB/s (100 ops) fillsync : 865.577 micros/op 1155 ops/sec 86.558 seconds 100000 operations; 0.1 MB/s (100 ops) fillsync [AVG 2 runs] : 1244 (± 175) ops/sec; 0.1 (± 0.0) MB/sec fillsync : 845.837 micros/op 1182 ops/sec 84.584 seconds 100000 operations; 0.1 MB/s (100 ops) fillsync [AVG 3 runs] : 1223 (± 109) ops/sec; 0.1 (± 0.0) MB/sec fillsync [AVG 3 runs] : 1223 (± 109) ops/sec; 0.1 (± 0.0) MB/sec fillsync [MEDIAN 3 runs] : 1182 ops/sec; 0.1 MB/sec ``` Reviewed By: ajkr Differential Revision: D37725212 Pulled By: riversand963 fbshipit-source-id: 8fa7d13b3c7662be5d56351c42caf3266af937ae --- HISTORY.md | 1 + db/db_basic_test.cc | 4 +++- db/db_impl/db_impl.cc | 8 ++++---- include/rocksdb/options.h | 8 +++++++- tools/db_bench_tool.cc | 5 +++++ 5 files changed, 20 insertions(+), 6 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 385f821f3..f90b460e1 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -26,6 +26,7 @@ ## Behavior Change * In leveled compaction with dynamic levelling, level multiplier is not anymore adjusted due to oversized L0. Instead, compaction score is adjusted by increasing size level target by adding incoming bytes from upper levels. This would deprioritize compactions from upper levels if more data from L0 is coming. This is to fix some unnecessary full stalling due to drastic change of level targets, while not wasting write bandwidth for compaction while writes are overloaded. +* For track_and_verify_wals_in_manifest, revert to the original behavior before #10087: syncing of live WAL file is not tracked, and we track only the synced sizes of **closed** WALs. (PR #10330). 
## 7.4.0 (06/19/2022) ### Bug Fixes diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 8e71a49c6..2fda22cc3 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -4195,7 +4195,9 @@ TEST_F(DBBasicTest, VerifyFileChecksums) { ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument()); } -TEST_F(DBBasicTest, ManualWalSync) { +// TODO: re-enable after we provide finer-grained control for WAL tracking to +// meet the needs of different use cases, durability levels and recovery modes. +TEST_F(DBBasicTest, DISABLED_ManualWalSync) { Options options = CurrentOptions(); options.track_and_verify_wals_in_manifest = true; options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency; diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 9ce044e79..4f58b7703 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -1442,12 +1442,12 @@ Status DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir) { for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) { auto& wal = *it; assert(wal.IsSyncing()); - if (immutable_db_options_.track_and_verify_wals_in_manifest && - wal.GetPreSyncSize() > 0) { - synced_wals.AddWal(wal.number, WalMetadata(wal.GetPreSyncSize())); - } if (logs_.size() > 1) { + if (immutable_db_options_.track_and_verify_wals_in_manifest && + wal.GetPreSyncSize() > 0) { + synced_wals.AddWal(wal.number, WalMetadata(wal.GetPreSyncSize())); + } logs_to_free_.push_back(wal.ReleaseWriter()); // To modify logs_ both mutex_ and log_write_mutex_ must be held InstrumentedMutexLock l(&log_write_mutex_); diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 18016e734..321e4f8e4 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -501,11 +501,17 @@ struct DBOptions { bool flush_verify_memtable_count = true; // If true, the log numbers and sizes of the synced WALs are tracked - // in MANIFEST, then during DB recovery, if a synced WAL is missing + // in MANIFEST. 
During DB recovery, if a synced WAL is missing // from disk, or the WAL's size does not match the recorded size in // MANIFEST, an error will be reported and the recovery will be aborted. // + // This is one additional protection against WAL corruption besides the + // per-WAL-entry checksum. + // // Note that this option does not work with secondary instance. + // Currently, only the syncing of closed WALs is tracked. Calling + // `DB::SyncWAL()`, etc. or writing with `WriteOptions::sync=true` to sync the + // live WAL is not tracked for performance/efficiency reasons. // // Default: false bool track_and_verify_wals_in_manifest = false; diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 37c1d3677..6fe45bb40 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -1681,6 +1681,9 @@ DEFINE_uint32(write_batch_protection_bytes_per_key, 0, DEFINE_bool(build_info, false, "Print the build info via GetRocksBuildInfoAsString"); +DEFINE_bool(track_and_verify_wals_in_manifest, false, + "If true, enable WAL tracking in the MANIFEST"); + namespace ROCKSDB_NAMESPACE { namespace { static Status CreateMemTableRepFactory( @@ -4485,6 +4488,8 @@ class Benchmark { } options.allow_data_in_errors = FLAGS_allow_data_in_errors; + options.track_and_verify_wals_in_manifest = + FLAGS_track_and_verify_wals_in_manifest; // Integrated BlobDB options.enable_blob_files = FLAGS_enable_blob_files;