// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). #include #include #include #include #include #include "db/db_test_util.h" #include "db/write_batch_internal.h" #include "db/write_thread.h" #include "port/port.h" #include "port/stack_trace.h" #include "test_util/sync_point.h" #include "util/random.h" #include "util/string_util.h" #include "utilities/fault_injection_env.h" namespace ROCKSDB_NAMESPACE { // Test variations of WriteImpl. class DBWriteTest : public DBTestBase, public testing::WithParamInterface { public: DBWriteTest() : DBTestBase("db_write_test", /*env_do_fsync=*/true) {} Options GetOptions() { return DBTestBase::GetOptions(GetParam()); } void Open() { DBTestBase::Reopen(GetOptions()); } }; class DBWriteTestUnparameterized : public DBTestBase { public: explicit DBWriteTestUnparameterized() : DBTestBase("pipelined_write_test", /*env_do_fsync=*/false) {} }; // It is invalid to do sync write while disabling WAL. TEST_P(DBWriteTest, SyncAndDisableWAL) { WriteOptions write_options; write_options.sync = true; write_options.disableWAL = true; ASSERT_TRUE(dbfull()->Put(write_options, "foo", "bar").IsInvalidArgument()); WriteBatch batch; ASSERT_OK(batch.Put("foo", "bar")); ASSERT_TRUE(dbfull()->Write(write_options, &batch).IsInvalidArgument()); } TEST_P(DBWriteTest, WriteStallRemoveNoSlowdownWrite) { Options options = GetOptions(); options.level0_stop_writes_trigger = options.level0_slowdown_writes_trigger = 4; std::vector threads; std::atomic thread_num(0); port::Mutex mutex; port::CondVar cv(&mutex); // Guarded by mutex int writers = 0; Reopen(options); std::function write_slowdown_func = [&]() { int a = thread_num.fetch_add(1); std::string key = "foo" + std::to_string(a); WriteOptions wo; wo.no_slowdown = false; ASSERT_OK(dbfull()->Put(wo, key, "bar")); }; std::function write_no_slowdown_func = [&]() { int a = thread_num.fetch_add(1); std::string key = "foo" + std::to_string(a); WriteOptions wo; wo.no_slowdown = true; Status s = dbfull()->Put(wo, key, "bar"); ASSERT_TRUE(s.ok() || s.IsIncomplete()); }; std::function unblock_main_thread_func = [&](void*) { mutex.Lock(); ++writers; cv.SignalAll(); mutex.Unlock(); }; // Create 3 L0 files and schedule 4th without waiting ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); ASSERT_OK(Flush()); ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); ASSERT_OK(Flush()); ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); ASSERT_OK(Flush()); ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "WriteThread::JoinBatchGroup:Start", unblock_main_thread_func); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"DBWriteTest::WriteStallRemoveNoSlowdownWrite:1", "DBImpl::BackgroundCallFlush:start"}, {"DBWriteTest::WriteStallRemoveNoSlowdownWrite:2", "DBImplWrite::PipelinedWriteImpl:AfterJoinBatchGroup"}, // Make compaction start wait for the write stall to be detected and // implemented by a write group leader {"DBWriteTest::WriteStallRemoveNoSlowdownWrite:3", "BackgroundCallCompaction:0"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); // Schedule creation of 4th L0 file without waiting. This will seal the // memtable and then wait for a sync point before writing the file. We need // to do it this way because SwitchMemtable() needs to enter the // write_thread FlushOptions fopt; fopt.wait = false; ASSERT_OK(dbfull()->Flush(fopt)); // Create a mix of slowdown/no_slowdown write threads mutex.Lock(); // First leader threads.emplace_back(write_slowdown_func); while (writers != 1) { cv.Wait(); } // Second leader. Will stall writes // Build a writers list with no slowdown in the middle: // +-------------+ // | slowdown +<----+ newest // +--+----------+ // | // v // +--+----------+ // | no slowdown | // +--+----------+ // | // v // +--+----------+ // | slowdown + // +-------------+ threads.emplace_back(write_slowdown_func); while (writers != 2) { cv.Wait(); } threads.emplace_back(write_no_slowdown_func); while (writers != 3) { cv.Wait(); } threads.emplace_back(write_slowdown_func); while (writers != 4) { cv.Wait(); } mutex.Unlock(); TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:1"); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(nullptr)); // This would have triggered a write stall. Unblock the write group leader TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:2"); // The leader is going to create missing newer links. When the leader // finishes, the next leader is going to delay writes and fail writers with // no_slowdown TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:3"); for (auto& t : threads) { t.join(); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) { Options options = GetOptions(); options.level0_stop_writes_trigger = options.level0_slowdown_writes_trigger = 4; std::vector threads; std::atomic thread_num(0); port::Mutex mutex; port::CondVar cv(&mutex); // Guarded by mutex int writers = 0; Reopen(options); std::function write_slowdown_func = [&]() { int a = thread_num.fetch_add(1); std::string key = "foo" + std::to_string(a); WriteOptions wo; wo.no_slowdown = false; ASSERT_OK(dbfull()->Put(wo, key, "bar")); }; std::function write_no_slowdown_func = [&]() { int a = thread_num.fetch_add(1); std::string key = "foo" + std::to_string(a); WriteOptions wo; wo.no_slowdown = true; Status s = dbfull()->Put(wo, key, "bar"); ASSERT_TRUE(s.ok() || s.IsIncomplete()); }; std::function unblock_main_thread_func = [&](void *) { mutex.Lock(); ++writers; cv.SignalAll(); mutex.Unlock(); }; // Create 3 L0 files and schedule 4th without waiting ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); ASSERT_OK(Flush()); ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); ASSERT_OK(Flush()); ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); ASSERT_OK(Flush()); ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "WriteThread::JoinBatchGroup:Start", unblock_main_thread_func); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"DBWriteTest::WriteThreadHangOnWriteStall:1", "DBImpl::BackgroundCallFlush:start"}, {"DBWriteTest::WriteThreadHangOnWriteStall:2", "DBImpl::WriteImpl:BeforeLeaderEnters"}, // Make compaction start wait for the write stall to be detected and // implemented by a write group leader {"DBWriteTest::WriteThreadHangOnWriteStall:3", "BackgroundCallCompaction:0"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); // Schedule creation of 4th L0 file without waiting. This will seal the // memtable and then wait for a sync point before writing the file. We need // to do it this way because SwitchMemtable() needs to enter the // write_thread FlushOptions fopt; fopt.wait = false; ASSERT_OK(dbfull()->Flush(fopt)); // Create a mix of slowdown/no_slowdown write threads mutex.Lock(); // First leader threads.emplace_back(write_slowdown_func); while (writers != 1) { cv.Wait(); } // Second leader. Will stall writes threads.emplace_back(write_slowdown_func); threads.emplace_back(write_no_slowdown_func); threads.emplace_back(write_slowdown_func); threads.emplace_back(write_no_slowdown_func); threads.emplace_back(write_slowdown_func); while (writers != 6) { cv.Wait(); } mutex.Unlock(); TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:1"); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(nullptr)); // This would have triggered a write stall. Unblock the write group leader TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:2"); // The leader is going to create missing newer links. When the leader finishes, // the next leader is going to delay writes and fail writers with no_slowdown TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:3"); for (auto& t : threads) { t.join(); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) { constexpr int kNumThreads = 5; std::unique_ptr mock_env( new FaultInjectionTestEnv(env_)); Options options = GetOptions(); options.env = mock_env.get(); Reopen(options); std::atomic ready_count{0}; std::atomic leader_count{0}; std::vector threads; mock_env->SetFilesystemActive(false); // Wait until all threads linked to write threads, to make sure // all threads join the same batch group. SyncPoint::GetInstance()->SetCallBack( "WriteThread::JoinBatchGroup:Wait", [&](void* arg) { ready_count++; auto* w = reinterpret_cast(arg); if (w->state == WriteThread::STATE_GROUP_LEADER) { leader_count++; while (ready_count < kNumThreads) { // busy waiting } } }); SyncPoint::GetInstance()->EnableProcessing(); for (int i = 0; i < kNumThreads; i++) { threads.push_back(port::Thread( [&](int index) { // All threads should fail. auto res = Put("key" + std::to_string(index), "value"); if (options.manual_wal_flush) { ASSERT_TRUE(res.ok()); // we should see fs error when we do the flush // TSAN reports a false alarm for lock-order-inversion but Open and // FlushWAL are not run concurrently. Disabling this until TSAN is // fixed. // res = dbfull()->FlushWAL(false); // ASSERT_FALSE(res.ok()); } else { ASSERT_FALSE(res.ok()); } }, i)); } for (int i = 0; i < kNumThreads; i++) { threads[i].join(); } ASSERT_EQ(1, leader_count); // The Failed PUT operations can cause a BG error to be set. // Mark it as Checked for the ASSERT_STATUS_CHECKED dbfull()->Resume().PermitUncheckedError(); // Close before mock_env destruct. Close(); } TEST_F(DBWriteTestUnparameterized, PipelinedWriteRace) { // This test was written to trigger a race in ExitAsBatchGroupLeader in case // enable_pipelined_write_ was true. // Writers for which ShouldWriteToMemtable() evaluates to false are removed // from the write_group via CompleteFollower/ CompleteLeader. Writers in the // middle of the group are fully unlinked, but if that writers is the // last_writer, then we did not update the predecessor's link_older, i.e., // this writer was still reachable via newest_writer_. // // But the problem was, that CompleteFollower already wakes up the thread // owning that writer before the writer has been removed. This resulted in a // race - if the leader thread was fast enough, then everything was fine. // However, if the woken up thread finished the current write operation and // then performed yet another write, then a new writer instance was added // to newest_writer_. It is possible that the new writer is located on the // same address on stack, and if this happened, then we had a problem, // because the old code tried to find the last_writer in the list to unlink // it, which in this case produced a cycle in the list. // Whether two invocations of PipelinedWriteImpl() by the same thread actually // allocate the writer on the same address depends on the OS and/or compiler, // so it is rather hard to create a deterministic test for this. Options options = GetDefaultOptions(); options.create_if_missing = true; options.enable_pipelined_write = true; std::vector threads; std::atomic write_counter{0}; std::atomic active_writers{0}; std::atomic second_write_starting{false}; std::atomic second_write_in_progress{false}; std::atomic leader{nullptr}; std::atomic finished_WAL_write{false}; DestroyAndReopen(options); auto write_one_doc = [&]() { int a = write_counter.fetch_add(1); std::string key = "foo" + std::to_string(a); WriteOptions wo; ASSERT_OK(dbfull()->Put(wo, key, "bar")); --active_writers; }; auto write_two_docs = [&]() { write_one_doc(); second_write_starting = true; write_one_doc(); }; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "WriteThread::JoinBatchGroup:Wait", [&](void* arg) { if (second_write_starting.load()) { second_write_in_progress = true; return; } auto* w = reinterpret_cast(arg); if (w->state == WriteThread::STATE_GROUP_LEADER) { active_writers++; if (leader.load() == nullptr) { leader.store(w); while (active_writers.load() < 2) { // wait for another thread to join the write_group } } } else { // we disable the memtable for all followers so that they they are // removed from the write_group before enqueuing it for the memtable // write w->disable_memtable = true; active_writers++; } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "WriteThread::ExitAsBatchGroupLeader:Start", [&](void* arg) { auto* wg = reinterpret_cast(arg); if (wg->leader == leader && !finished_WAL_write) { finished_WAL_write = true; while (active_writers.load() < 3) { // wait for the new writer to be enqueued } } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "WriteThread::ExitAsBatchGroupLeader:AfterCompleteWriters", [&](void* arg) { auto* wg = reinterpret_cast(arg); if (wg->leader == leader) { while (!second_write_in_progress.load()) { // wait for the old follower thread to start the next write } } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); // start leader + one follower threads.emplace_back(write_one_doc); while (leader.load() == nullptr) { // wait for leader } // we perform two writes in the follower, so that for the second write // the thread reinserts a Writer with the same address threads.emplace_back(write_two_docs); // wait for the leader to enter ExitAsBatchGroupLeader while (!finished_WAL_write.load()) { // wait for write_group to have finished the WAL writes } // start another writer thread to be enqueued before the leader can // complete the writers from its write_group threads.emplace_back(write_one_doc); for (auto& t : threads) { t.join(); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } TEST_P(DBWriteTest, ManualWalFlushInEffect) { Options options = GetOptions(); Reopen(options); // try the 1st WAL created during open ASSERT_TRUE(Put("key" + std::to_string(0), "value").ok()); ASSERT_TRUE(options.manual_wal_flush != dbfull()->WALBufferIsEmpty()); ASSERT_TRUE(dbfull()->FlushWAL(false).ok()); ASSERT_TRUE(dbfull()->WALBufferIsEmpty()); // try the 2nd wal created during SwitchWAL ASSERT_OK(dbfull()->TEST_SwitchWAL()); ASSERT_TRUE(Put("key" + std::to_string(0), "value").ok()); ASSERT_TRUE(options.manual_wal_flush != dbfull()->WALBufferIsEmpty()); ASSERT_TRUE(dbfull()->FlushWAL(false).ok()); ASSERT_TRUE(dbfull()->WALBufferIsEmpty()); } TEST_P(DBWriteTest, UnflushedPutRaceWithTrackedWalSync) { // Repro race condition bug where unflushed WAL data extended the synced size // recorded to MANIFEST despite being unrecoverable. Options options = GetOptions(); std::unique_ptr fault_env( new FaultInjectionTestEnv(env_)); options.env = fault_env.get(); options.manual_wal_flush = true; options.track_and_verify_wals_in_manifest = true; Reopen(options); ASSERT_OK(Put("key1", "val1")); SyncPoint::GetInstance()->SetCallBack( "DBImpl::SyncWAL:Begin", [this](void* /* arg */) { ASSERT_OK(Put("key2", "val2")); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(db_->FlushWAL(true /* sync */)); // Ensure callback ran. ASSERT_EQ("val2", Get("key2")); Close(); // Simulate full loss of unsynced data. This drops "key2" -> "val2" from the // DB WAL. fault_env->DropUnsyncedFileData(); Reopen(options); // Need to close before `fault_env` goes out of scope. Close(); } TEST_P(DBWriteTest, InactiveWalFullySyncedBeforeUntracked) { // Repro bug where a WAL is appended and switched after // `FlushWAL(true /* sync */)`'s sync finishes and before it untracks fully // synced inactive logs. Previously such a WAL would be wrongly untracked // so the final append would never be synced. Options options = GetOptions(); std::unique_ptr fault_env( new FaultInjectionTestEnv(env_)); options.env = fault_env.get(); Reopen(options); ASSERT_OK(Put("key1", "val1")); SyncPoint::GetInstance()->SetCallBack( "DBImpl::SyncWAL:BeforeMarkLogsSynced:1", [this](void* /* arg */) { ASSERT_OK(Put("key2", "val2")); ASSERT_OK(dbfull()->TEST_SwitchMemtable()); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(db_->FlushWAL(true /* sync */)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); ASSERT_OK(Put("key3", "val3")); ASSERT_OK(db_->FlushWAL(true /* sync */)); Close(); // Simulate full loss of unsynced data. This should drop nothing since we did // `FlushWAL(true /* sync */)` before `Close()`. fault_env->DropUnsyncedFileData(); Reopen(options); ASSERT_EQ("val1", Get("key1")); ASSERT_EQ("val2", Get("key2")); ASSERT_EQ("val3", Get("key3")); // Need to close before `fault_env` goes out of scope. Close(); } TEST_P(DBWriteTest, IOErrorOnWALWriteTriggersReadOnlyMode) { std::unique_ptr mock_env( new FaultInjectionTestEnv(env_)); Options options = GetOptions(); options.env = mock_env.get(); Reopen(options); for (int i = 0; i < 2; i++) { // Forcibly fail WAL write for the first Put only. Subsequent Puts should // fail due to read-only mode mock_env->SetFilesystemActive(i != 0); auto res = Put("key" + std::to_string(i), "value"); // TSAN reports a false alarm for lock-order-inversion but Open and // FlushWAL are not run concurrently. Disabling this until TSAN is // fixed. /* if (options.manual_wal_flush && i == 0) { // even with manual_wal_flush the 2nd Put should return error because of // the read-only mode ASSERT_TRUE(res.ok()); // we should see fs error when we do the flush res = dbfull()->FlushWAL(false); } */ if (!options.manual_wal_flush) { ASSERT_NOK(res); } else { ASSERT_OK(res); } } // Close before mock_env destruct. Close(); } TEST_P(DBWriteTest, IOErrorOnSwitchMemtable) { Random rnd(301); std::unique_ptr mock_env( new FaultInjectionTestEnv(env_)); Options options = GetOptions(); options.env = mock_env.get(); options.writable_file_max_buffer_size = 4 * 1024 * 1024; options.write_buffer_size = 3 * 512 * 1024; options.wal_bytes_per_sync = 256 * 1024; options.manual_wal_flush = true; Reopen(options); mock_env->SetFilesystemActive(false, Status::IOError("Not active")); Status s; for (int i = 0; i < 4 * 512; ++i) { s = Put(Key(i), rnd.RandomString(1024)); if (!s.ok()) { break; } } ASSERT_EQ(s.severity(), Status::Severity::kFatalError); mock_env->SetFilesystemActive(true); // Close before mock_env destruct. Close(); } // Test that db->LockWAL() flushes the WAL after locking. TEST_P(DBWriteTest, LockWalInEffect) { Options options = GetOptions(); Reopen(options); // try the 1st WAL created during open ASSERT_OK(Put("key" + std::to_string(0), "value")); ASSERT_TRUE(options.manual_wal_flush != dbfull()->WALBufferIsEmpty()); ASSERT_OK(dbfull()->LockWAL()); ASSERT_TRUE(dbfull()->WALBufferIsEmpty(false)); ASSERT_OK(dbfull()->UnlockWAL()); // try the 2nd wal created during SwitchWAL ASSERT_OK(dbfull()->TEST_SwitchWAL()); ASSERT_OK(Put("key" + std::to_string(0), "value")); ASSERT_TRUE(options.manual_wal_flush != dbfull()->WALBufferIsEmpty()); ASSERT_OK(dbfull()->LockWAL()); ASSERT_TRUE(dbfull()->WALBufferIsEmpty(false)); ASSERT_OK(dbfull()->UnlockWAL()); } TEST_P(DBWriteTest, ConcurrentlyDisabledWAL) { Options options = GetOptions(); options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); options.statistics->set_stats_level(StatsLevel::kAll); Reopen(options); std::string wal_key_prefix = "WAL_KEY_"; std::string no_wal_key_prefix = "K_"; // 100 KB value each for NO-WAL operation std::string no_wal_value(1024 * 100, 'X'); // 1B value each for WAL operation std::string wal_value = "0"; std::thread threads[10]; for (int t = 0; t < 10; t++) { threads[t] = std::thread([t, wal_key_prefix, wal_value, no_wal_key_prefix, no_wal_value, this] { for(int i = 0; i < 10; i++) { ROCKSDB_NAMESPACE::WriteOptions write_option_disable; write_option_disable.disableWAL = true; ROCKSDB_NAMESPACE::WriteOptions write_option_default; std::string no_wal_key = no_wal_key_prefix + std::to_string(t) + "_" + std::to_string(i); ASSERT_OK( this->Put(no_wal_key, no_wal_value, write_option_disable)); std::string wal_key = wal_key_prefix + std::to_string(i) + "_" + std::to_string(i); ASSERT_OK(this->Put(wal_key, wal_value, write_option_default)); ASSERT_OK(dbfull()->SyncWAL()); } return; }); } for (auto& t: threads) { t.join(); } uint64_t bytes_num = options.statistics->getTickerCount( ROCKSDB_NAMESPACE::Tickers::WAL_FILE_BYTES); // written WAL size should less than 100KB (even included HEADER & FOOTER overhead) ASSERT_LE(bytes_num, 1024 * 100); } INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest, testing::Values(DBTestBase::kDefault, DBTestBase::kConcurrentWALWrites, DBTestBase::kPipelinedWrite)); } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); }