diff --git a/HISTORY.md b/HISTORY.md index 79f4baf3e..e20569d2a 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -5,6 +5,7 @@ ### Bug Fixes * Prevent a `CompactRange()` with `CompactRangeOptions::change_level == true` from possibly causing corruption to the LSM state (overlapping files within a level) when run in parallel with another manual compaction. Note that setting `force_consistency_checks == true` (the default) would cause the DB to enter read-only mode in this scenario and return `Status::Corruption`, rather than committing any corruption. +* Fixed a bug in CompactionIterator when write-prepared transaction is used. A released earliest write conflict snapshot may cause assertion failure in dbg mode and unexpected key in opt mode. ## 6.26.0 (2021-10-20) ### Bug Fixes diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc index ed7fb9251..70df36561 100644 --- a/db/compaction/compaction_iterator.cc +++ b/db/compaction/compaction_iterator.cc @@ -520,6 +520,25 @@ void CompactionIterator::NextFromInput() { // 2) We've already returned a record in this snapshot -OR- // there are no earlier earliest_write_conflict_snapshot. // + // A note about 2) above: + // we try to determine whether there is any earlier write conflict + // checking snapshot by calling DefinitelyInSnapshot() with seq and + // earliest_write_conflict_snapshot as arguments. For write-prepared + // and write-unprepared transactions, if earliest_write_conflict_snapshot + // is evicted from WritePreparedTxnDB::commit_cache, then + // DefinitelyInSnapshot(seq, earliest_write_conflict_snapshot) returns + // false, even if the seq is actually visible within + // earliest_write_conflict_snapshot. Consequently, CompactionIterator + // may try to zero out its sequence number, thus hitting assertion error + // in debug mode or cause incorrect DBIter return result. + // We observe that earliest_write_conflict_snapshot >= earliest_snapshot, + // and the seq zeroing logic depends on + // DefinitelyInSnapshot(seq, earliest_snapshot). Therefore, if we cannot + // determine whether seq is **definitely** in + // earliest_write_conflict_snapshot, then we can additionally check if + // seq is definitely in earliest_snapshot. If the latter holds, then the + // former holds too. + // // Rule 1 is needed for SingleDelete correctness. Rule 2 is needed to // allow Transactions to do write-conflict checking (if we compacted away // all keys, then we wouldn't know that a write happened in this @@ -598,7 +617,10 @@ void CompactionIterator::NextFromInput() { valid_ = true; } else if (has_outputted_key_ || DefinitelyInSnapshot(ikey_.sequence, - earliest_write_conflict_snapshot_)) { + earliest_write_conflict_snapshot_) || + (earliest_snapshot_ < earliest_write_conflict_snapshot_ && + DefinitelyInSnapshot(ikey_.sequence, + earliest_snapshot_))) { // Found a matching value, we can drop the single delete and the // value. It is safe to drop both records since we've already // outputted a key in this snapshot, or there is no earlier @@ -1084,41 +1106,6 @@ inline bool CompactionIterator::ikeyNotNeededForIncrementalSnapshot() { (ikey_.sequence < preserve_deletes_seqnum_); } -bool CompactionIterator::IsInCurrentEarliestSnapshot(SequenceNumber sequence) { - assert(snapshot_checker_ != nullptr); - bool pre_condition = (earliest_snapshot_ == kMaxSequenceNumber || - (earliest_snapshot_iter_ != snapshots_->end() && - *earliest_snapshot_iter_ == earliest_snapshot_)); - assert(pre_condition); - if (!pre_condition) { - ROCKS_LOG_FATAL(info_log_, - "Pre-Condition is not hold in IsInEarliestSnapshot"); - } - auto in_snapshot = - snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_); - while (UNLIKELY(in_snapshot == SnapshotCheckerResult::kSnapshotReleased)) { - // Avoid the the current earliest_snapshot_ being return as - // earliest visible snapshot for the next value. So if a value's sequence - // is zero-ed out by PrepareOutput(), the next value will be compact out. - released_snapshots_.insert(earliest_snapshot_); - earliest_snapshot_iter_++; - - if (earliest_snapshot_iter_ == snapshots_->end()) { - earliest_snapshot_ = kMaxSequenceNumber; - } else { - earliest_snapshot_ = *earliest_snapshot_iter_; - } - in_snapshot = - snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_); - } - assert(in_snapshot != SnapshotCheckerResult::kSnapshotReleased); - if (in_snapshot == SnapshotCheckerResult::kSnapshotReleased) { - ROCKS_LOG_FATAL(info_log_, - "Unexpected released snapshot in IsInEarliestSnapshot"); - } - return in_snapshot == SnapshotCheckerResult::kInSnapshot; -} - uint64_t CompactionIterator::ComputeBlobGarbageCollectionCutoffFileNumber( const CompactionProxy* compaction) { if (!compaction) { diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h index f550c8e36..7431e6688 100644 --- a/db/compaction/compaction_iterator.h +++ b/db/compaction/compaction_iterator.h @@ -271,14 +271,10 @@ class CompactionIterator { SnapshotCheckerResult::kInSnapshot; } - bool IsInCurrentEarliestSnapshot(SequenceNumber sequence); - bool DefinitelyInSnapshot(SequenceNumber seq, SequenceNumber snapshot); bool DefinitelyNotInSnapshot(SequenceNumber seq, SequenceNumber snapshot); - bool InCurrentEarliestSnapshot(SequenceNumber seq); - // Extract user-defined timestamp from user key if possible and compare it // with *full_history_ts_low_ if applicable. inline void UpdateTimestampAndCompareWithFullHistoryLow() { @@ -439,10 +435,4 @@ inline bool CompactionIterator::DefinitelyNotInSnapshot( SnapshotCheckerResult::kNotInSnapshot))); } -inline bool CompactionIterator::InCurrentEarliestSnapshot(SequenceNumber seq) { - return ((seq) <= earliest_snapshot_ && - (snapshot_checker_ == nullptr || - LIKELY(IsInCurrentEarliestSnapshot(seq)))); -} - } // namespace ROCKSDB_NAMESPACE diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index f5d77611a..db0a2a8e7 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -2996,6 +2996,77 @@ TEST_P(WritePreparedTransactionTest, SyncPoint::GetInstance()->ClearAllCallBacks(); } +TEST_P(WritePreparedTransactionTest, + ReleaseEarliestWriteConflictSnapshot_SingleDelete) { + constexpr size_t kSnapshotCacheBits = 7; // same as default + constexpr size_t kCommitCacheBits = 0; // minimum commit cache + UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + ASSERT_OK(db->Put(WriteOptions(), "a", "value")); + ASSERT_OK(db->Put(WriteOptions(), "b", "value")); + ASSERT_OK(db->Put(WriteOptions(), "c", "value")); + ASSERT_OK(db->Flush(FlushOptions())); + + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr)); + } + + std::unique_ptr txn; + txn.reset(db->BeginTransaction(WriteOptions(), TransactionOptions(), + /*old_txn=*/nullptr)); + ASSERT_OK(txn->SetName("txn1")); + ASSERT_OK(txn->SingleDelete("b")); + ASSERT_OK(txn->Prepare()); + ASSERT_OK(txn->Commit()); + + auto* snapshot1 = db->GetSnapshot(); + + // Bump seq of the db by performing writes so that + // earliest_snapshot_ < earliest_write_conflict_snapshot_ in + // CompactionIterator. + ASSERT_OK(db->Put(WriteOptions(), "z", "dontcare")); + + // Create another snapshot for write conflict checking + std::unique_ptr txn2; + { + TransactionOptions txn_opts; + txn_opts.set_snapshot = true; + txn2.reset( + db->BeginTransaction(WriteOptions(), txn_opts, /*old_txn=*/nullptr)); + } + + // Bump seq so that the subsequent bg flush won't create a snapshot with the + // same seq as the previous snapshot for conflict checking. + ASSERT_OK(db->Put(WriteOptions(), "y", "dont")); + + ASSERT_OK(db->Flush(FlushOptions())); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::NextFromInput:SingleDelete:1", [&](void* /*arg*/) { + // Rolling back txn2 should release its snapshot(for ww checking). + ASSERT_OK(txn2->Rollback()); + txn2.reset(); + // Advance max_evicted_seq + ASSERT_OK(db->Put(WriteOptions(), "x", "value")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + db->ReleaseSnapshot(snapshot1); +} + // A more complex test to verify compaction/flush should keep keys visible // to snapshots. TEST_P(WritePreparedTransactionTest,