Fix a race condition in transaction stress test (#10157)

Summary:
Before this PR, there can be a race condition between the thread calling
`StressTest::Open()` and a background compaction thread calling
`MultiOpsTxnsStressTest::VerifyPkSkFast()`.

```
Time   thread1                             bg_compact_thr
 |     TransactionDB::Open(..., &txn_db_)
 |     db_ is still nullptr
 |                                         db_->GetSnapshot()  // segfault
 |     db_ = txn_db_
 V
```

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10157

Test Plan: CI

Reviewed By: akankshamahajan15

Differential Revision: D37121653

Pulled By: riversand963

fbshipit-source-id: 6a53117f958e9ee86f77297fdeb843e5160a9331
main
Yanqin Jin 3 years ago committed by Facebook GitHub Bot
parent c0e0f30667
commit bfaf8291c5
  1. 7
      db_stress_tool/db_stress_test_base.cc
  2. 4
      db_stress_tool/db_stress_test_base.h
  3. 12
      db_stress_tool/multi_ops_txns_stress.cc

@ -66,6 +66,7 @@ StressTest::StressTest()
#ifndef ROCKSDB_LITE
txn_db_(nullptr),
#endif
db_aptr_(nullptr),
clock_(db_stress_env->GetSystemClock().get()),
new_column_family_name_(1),
num_times_reopened_(0),
@ -2541,7 +2542,13 @@ void StressTest::Open(SharedState* shared) {
fflush(stderr);
}
assert(s.ok());
// Do not swap the order of the following.
{
db_ = txn_db_;
db_aptr_.store(txn_db_, std::memory_order_release);
}
// after a crash, rollback to commit recovered transactions
std::vector<Transaction*> trans;
txn_db_->GetAllPreparedTransactions(&trans);

@ -236,6 +236,10 @@ class StressTest {
#ifndef ROCKSDB_LITE
TransactionDB* txn_db_;
#endif
// Currently only used in MultiOpsTxnsStressTest
std::atomic<DB*> db_aptr_;
Options options_;
SystemClock* clock_;
std::vector<ColumnFamilyHandle*> column_families_;

@ -1248,7 +1248,19 @@ void MultiOpsTxnsStressTest::VerifyDb(ThreadState* thread) const {
}
}
// VerifyPkSkFast() can be called by MultiOpsTxnsStressListener's callbacks
// which can be called before TransactionDB::Open() returns to caller.
// Therefore, at that time, db_ and txn_db_ may still be nullptr.
// Caller has to make sure that the race condition does not happen.
void MultiOpsTxnsStressTest::VerifyPkSkFast(int job_id) {
DB* const db = db_aptr_.load(std::memory_order_acquire);
if (db == nullptr) {
return;
}
assert(db_ == db);
assert(db_ != nullptr);
const Snapshot* const snapshot = db_->GetSnapshot();
assert(snapshot);
ManagedSnapshot snapshot_guard(db_, snapshot);

Loading…
Cancel
Save