From f24c39ab3d1536285d87e57696718226d1469b50 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Wed, 27 Oct 2021 23:07:29 -0700 Subject: [PATCH] Prevent corruption with parallel manual compactions and `change_level == true` (#9077) Summary: The bug can impact the following scenario. There must be two `CompactRange()`s, call them A and B. Compaction A must have `change_level=true`. Compactions A and B must run in parallel, and new data must be added while they run as well. Now, on to the details of the race condition. Compaction A must reach the refitting phase while B's next step is to trivial move new data (i.e., data that has been inserted behind A) down to the same level that A's refit targets (`CompactRangeOptions::target_level`). B must be unregistered (i.e., has not yet called `AddManualCompaction()` for the current `RunManualCompaction()`) while A invokes `DisableManualCompaction()`s to prepare for refitting. In the old code, B could still proceed to register a manual compaction, while A had disabled manual compaction. The next part of the race condition is B picks and schedules a trivial move while A has released the lock in refitting phase in order to persist the LSM state change (i.e., the log phase of `LogAndApply()`). That way, B does not see the refitted data when picking a trivial-move compaction. So it is susceptible to picking one that overlaps. Finally, B executes the picked trivial-move compaction. Trivial-move compactions are special in that they never check whether manual compaction is disabled. So the picked compaction causing overlap ends up being applied, leading to LSM corruption if `force_consistency_checks=false`, or entering read-only mode with `Status::Corruption` if `force_consistency_checks=true` (the default). The fix is just to prevent B from registering itself in `RunManualCompaction()` while manual compactions are disabled, consequently preventing any trivial move or other compaction from being picked/scheduled. Thanks to siying for finding the bug. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9077 Test Plan: The test does not go all the way in exposing the bug because it requires a compaction to be picked/scheduled while logging LSM state change for RefitLevel(). But the fix is to make such a compaction not picked/scheduled in the first place, so any repro of that scenario would end up hanging RefitLevel() logging. So instead I just verified no such compaction is registered in the scenario where `RefitLevel()` disables manual compactions. Reviewed By: siying Differential Revision: D31921908 Pulled By: ajkr fbshipit-source-id: 9bb5d0e847ad428211227f40830c685c209fbecb --- HISTORY.md | 4 ++ db/db_compaction_test.cc | 76 ++++++++++++++++++++++++++ db/db_impl/db_impl_compaction_flush.cc | 14 +++++ 3 files changed, 94 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index e20260798..c4e9f43a0 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,4 +1,8 @@ # Rocksdb Change Log +## Unreleased +### Bug Fixes +* Prevent a `CompactRange()` with `CompactRangeOptions::change_level == true` from possibly causing corruption to the LSM state (overlapping files within a level) when run in parallel with another manual compaction. Note that setting `force_consistency_checks == true` (the default) would cause the DB to enter read-only mode in this scenario and return `Status::Corruption`, rather than committing any corruption. + ## 6.26.0 (2021-10-20) ### Bug Fixes * Fixes a bug in directed IO mode when calling MultiGet() for blobs in the same blob file. The bug is caused by not sorting the blob read requests by file offsets. diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 41d12edd4..d376592f1 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -6863,6 +6863,82 @@ TEST_F(DBCompactionTest, ASSERT_TRUE(callback_completed); } +TEST_F(DBCompactionTest, ChangeLevelConflictsWithManual) { + Options options = CurrentOptions(); + options.num_levels = 3; + Reopen(options); + + // Setup an LSM with L2 populated. + Random rnd(301); + ASSERT_OK(Put(Key(0), rnd.RandomString(990))); + ASSERT_OK(Put(Key(1), rnd.RandomString(990))); + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + + // The background thread will refit L2->L1 while the foreground thread will + // attempt to run a compaction on new data. The following dependencies + // ensure the background manual compaction's refitting phase disables manual + // compaction immediately before the foreground manual compaction can register + // itself. Manual compaction is kept disabled until the foreground manual + // checks for the failure once. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + // Only do Put()s for foreground CompactRange() once the background + // CompactRange() has reached the refitting phase. + { + "DBImpl::CompactRange:BeforeRefit:1", + "DBCompactionTest::ChangeLevelConflictsWithManual:" + "PreForegroundCompactRange", + }, + // Right before we register the manual compaction, proceed with + // the refitting phase so manual compactions are disabled. Stay in + // the refitting phase with manual compactions disabled until it is + // noticed. + { + "DBImpl::RunManualCompaction:0", + "DBImpl::CompactRange:BeforeRefit:2", + }, + { + "DBImpl::CompactRange:PreRefitLevel", + "DBImpl::RunManualCompaction:1", + }, + { + "DBImpl::RunManualCompaction:PausedAtStart", + "DBImpl::CompactRange:PostRefitLevel", + }, + // If compaction somehow were scheduled, let's let it run after reenabling + // manual compactions. This dependency is not expected to be hit but is + // here for speculatively coercing future bugs. + { + "DBImpl::CompactRange:PostRefitLevel:ManualCompactionEnabled", + "BackgroundCallCompaction:0", + }, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread refit_level_thread([&] { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 1; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + }); + + TEST_SYNC_POINT( + "DBCompactionTest::ChangeLevelConflictsWithManual:" + "PreForegroundCompactRange"); + ASSERT_OK(Put(Key(0), rnd.RandomString(990))); + ASSERT_OK(Put(Key(1), rnd.RandomString(990))); + ASSERT_TRUE(dbfull() + ->CompactRange(CompactRangeOptions(), nullptr, nullptr) + .IsIncomplete()); + + refit_level_thread.join(); +} + #endif // !defined(ROCKSDB_LITE) } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index bb56c64a1..6b23f926f 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -1110,6 +1110,8 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, assert(temp_s.ok()); } EnableManualCompaction(); + TEST_SYNC_POINT( + "DBImpl::CompactRange:PostRefitLevel:ManualCompactionEnabled"); } LogFlush(immutable_db_options_.info_log); @@ -1743,6 +1745,17 @@ Status DBImpl::RunManualCompaction( TEST_SYNC_POINT("DBImpl::RunManualCompaction:1"); InstrumentedMutexLock l(&mutex_); + if (manual_compaction_paused_ > 0) { + // Does not make sense to `AddManualCompaction()` in this scenario since + // `DisableManualCompaction()` just waited for the manual compaction queue + // to drain. So return immediately. + TEST_SYNC_POINT("DBImpl::RunManualCompaction:PausedAtStart"); + manual.status = + Status::Incomplete(Status::SubCode::kManualCompactionPaused); + manual.done = true; + return manual.status; + } + // When a manual compaction arrives, temporarily disable scheduling of // non-manual compactions and wait until the number of scheduled compaction // jobs drops to zero. This used to be needed to ensure that this manual @@ -3384,6 +3397,7 @@ bool DBImpl::HasPendingManualCompaction() { } void DBImpl::AddManualCompaction(DBImpl::ManualCompactionState* m) { + assert(manual_compaction_paused_ == 0); manual_compaction_dequeue_.push_back(m); }