Fix a compaction bug for write-prepared txn (#9061)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9061

In write-prepared txn, checking a sequence's visibility in a released (old)
snapshot may return "Snapshot released". Suppose we have two snapshots:

```
earliest_snap < earliest_write_conflict_snap
```

If we release `earliest_write_conflict_snap` but keep `earliest_snap` during
bottommost level compaction, then it is possible that certain sequence of
events can lead to a PUT being seq-zeroed followed by a SingleDelete of the
same key. This violates the ascending order of keys, and will cause data
inconsistency.

Reviewed By: ltamasi

Differential Revision: D31813017

fbshipit-source-id: dc68ba2541d1228489b93cf3edda5f37ed06f285
main
Yanqin Jin 3 years ago committed by Facebook GitHub Bot
parent f2d11b3fdc
commit fdf2a0d7eb
  1. 1
      HISTORY.md
  2. 59
      db/compaction/compaction_iterator.cc
  3. 10
      db/compaction/compaction_iterator.h
  4. 71
      utilities/transactions/write_prepared_transaction_test.cc

@ -5,6 +5,7 @@
### Bug Fixes
* Prevent a `CompactRange()` with `CompactRangeOptions::change_level == true` from possibly causing corruption to the LSM state (overlapping files within a level) when run in parallel with another manual compaction. Note that setting `force_consistency_checks == true` (the default) would cause the DB to enter read-only mode in this scenario and return `Status::Corruption`, rather than committing any corruption.
* Fixed a bug in CompactionIterator when write-prepared transaction is used. A released earliest write conflict snapshot may cause assertion failure in dbg mode and unexpected key in opt mode.
## 6.26.0 (2021-10-20)
### Bug Fixes

@ -520,6 +520,25 @@ void CompactionIterator::NextFromInput() {
// 2) We've already returned a record in this snapshot -OR-
// there are no earlier earliest_write_conflict_snapshot.
//
// A note about 2) above:
// we try to determine whether there is any earlier write conflict
// checking snapshot by calling DefinitelyInSnapshot() with seq and
// earliest_write_conflict_snapshot as arguments. For write-prepared
// and write-unprepared transactions, if earliest_write_conflict_snapshot
// is evicted from WritePreparedTxnDB::commit_cache, then
// DefinitelyInSnapshot(seq, earliest_write_conflict_snapshot) returns
// false, even if the seq is actually visible within
// earliest_write_conflict_snapshot. Consequently, CompactionIterator
// may try to zero out its sequence number, thus hitting assertion error
// in debug mode or cause incorrect DBIter return result.
// We observe that earliest_write_conflict_snapshot >= earliest_snapshot,
// and the seq zeroing logic depends on
// DefinitelyInSnapshot(seq, earliest_snapshot). Therefore, if we cannot
// determine whether seq is **definitely** in
// earliest_write_conflict_snapshot, then we can additionally check if
// seq is definitely in earliest_snapshot. If the latter holds, then the
// former holds too.
//
// Rule 1 is needed for SingleDelete correctness. Rule 2 is needed to
// allow Transactions to do write-conflict checking (if we compacted away
// all keys, then we wouldn't know that a write happened in this
@ -598,7 +617,10 @@ void CompactionIterator::NextFromInput() {
valid_ = true;
} else if (has_outputted_key_ ||
DefinitelyInSnapshot(ikey_.sequence,
earliest_write_conflict_snapshot_)) {
earliest_write_conflict_snapshot_) ||
(earliest_snapshot_ < earliest_write_conflict_snapshot_ &&
DefinitelyInSnapshot(ikey_.sequence,
earliest_snapshot_))) {
// Found a matching value, we can drop the single delete and the
// value. It is safe to drop both records since we've already
// outputted a key in this snapshot, or there is no earlier
@ -1084,41 +1106,6 @@ inline bool CompactionIterator::ikeyNotNeededForIncrementalSnapshot() {
(ikey_.sequence < preserve_deletes_seqnum_);
}
bool CompactionIterator::IsInCurrentEarliestSnapshot(SequenceNumber sequence) {
assert(snapshot_checker_ != nullptr);
bool pre_condition = (earliest_snapshot_ == kMaxSequenceNumber ||
(earliest_snapshot_iter_ != snapshots_->end() &&
*earliest_snapshot_iter_ == earliest_snapshot_));
assert(pre_condition);
if (!pre_condition) {
ROCKS_LOG_FATAL(info_log_,
"Pre-Condition is not hold in IsInEarliestSnapshot");
}
auto in_snapshot =
snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_);
while (UNLIKELY(in_snapshot == SnapshotCheckerResult::kSnapshotReleased)) {
// Avoid the the current earliest_snapshot_ being return as
// earliest visible snapshot for the next value. So if a value's sequence
// is zero-ed out by PrepareOutput(), the next value will be compact out.
released_snapshots_.insert(earliest_snapshot_);
earliest_snapshot_iter_++;
if (earliest_snapshot_iter_ == snapshots_->end()) {
earliest_snapshot_ = kMaxSequenceNumber;
} else {
earliest_snapshot_ = *earliest_snapshot_iter_;
}
in_snapshot =
snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_);
}
assert(in_snapshot != SnapshotCheckerResult::kSnapshotReleased);
if (in_snapshot == SnapshotCheckerResult::kSnapshotReleased) {
ROCKS_LOG_FATAL(info_log_,
"Unexpected released snapshot in IsInEarliestSnapshot");
}
return in_snapshot == SnapshotCheckerResult::kInSnapshot;
}
uint64_t CompactionIterator::ComputeBlobGarbageCollectionCutoffFileNumber(
const CompactionProxy* compaction) {
if (!compaction) {

@ -271,14 +271,10 @@ class CompactionIterator {
SnapshotCheckerResult::kInSnapshot;
}
bool IsInCurrentEarliestSnapshot(SequenceNumber sequence);
bool DefinitelyInSnapshot(SequenceNumber seq, SequenceNumber snapshot);
bool DefinitelyNotInSnapshot(SequenceNumber seq, SequenceNumber snapshot);
bool InCurrentEarliestSnapshot(SequenceNumber seq);
// Extract user-defined timestamp from user key if possible and compare it
// with *full_history_ts_low_ if applicable.
inline void UpdateTimestampAndCompareWithFullHistoryLow() {
@ -439,10 +435,4 @@ inline bool CompactionIterator::DefinitelyNotInSnapshot(
SnapshotCheckerResult::kNotInSnapshot)));
}
inline bool CompactionIterator::InCurrentEarliestSnapshot(SequenceNumber seq) {
return ((seq) <= earliest_snapshot_ &&
(snapshot_checker_ == nullptr ||
LIKELY(IsInCurrentEarliestSnapshot(seq))));
}
} // namespace ROCKSDB_NAMESPACE

@ -2996,6 +2996,77 @@ TEST_P(WritePreparedTransactionTest,
SyncPoint::GetInstance()->ClearAllCallBacks();
}
TEST_P(WritePreparedTransactionTest,
ReleaseEarliestWriteConflictSnapshot_SingleDelete) {
constexpr size_t kSnapshotCacheBits = 7; // same as default
constexpr size_t kCommitCacheBits = 0; // minimum commit cache
UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits);
options.disable_auto_compactions = true;
ASSERT_OK(ReOpen());
ASSERT_OK(db->Put(WriteOptions(), "a", "value"));
ASSERT_OK(db->Put(WriteOptions(), "b", "value"));
ASSERT_OK(db->Put(WriteOptions(), "c", "value"));
ASSERT_OK(db->Flush(FlushOptions()));
{
CompactRangeOptions cro;
cro.change_level = true;
cro.target_level = 2;
ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
}
std::unique_ptr<Transaction> txn;
txn.reset(db->BeginTransaction(WriteOptions(), TransactionOptions(),
/*old_txn=*/nullptr));
ASSERT_OK(txn->SetName("txn1"));
ASSERT_OK(txn->SingleDelete("b"));
ASSERT_OK(txn->Prepare());
ASSERT_OK(txn->Commit());
auto* snapshot1 = db->GetSnapshot();
// Bump seq of the db by performing writes so that
// earliest_snapshot_ < earliest_write_conflict_snapshot_ in
// CompactionIterator.
ASSERT_OK(db->Put(WriteOptions(), "z", "dontcare"));
// Create another snapshot for write conflict checking
std::unique_ptr<Transaction> txn2;
{
TransactionOptions txn_opts;
txn_opts.set_snapshot = true;
txn2.reset(
db->BeginTransaction(WriteOptions(), txn_opts, /*old_txn=*/nullptr));
}
// Bump seq so that the subsequent bg flush won't create a snapshot with the
// same seq as the previous snapshot for conflict checking.
ASSERT_OK(db->Put(WriteOptions(), "y", "dont"));
ASSERT_OK(db->Flush(FlushOptions()));
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
SyncPoint::GetInstance()->SetCallBack(
"CompactionIterator::NextFromInput:SingleDelete:1", [&](void* /*arg*/) {
// Rolling back txn2 should release its snapshot(for ww checking).
ASSERT_OK(txn2->Rollback());
txn2.reset();
// Advance max_evicted_seq
ASSERT_OK(db->Put(WriteOptions(), "x", "value"));
});
SyncPoint::GetInstance()->EnableProcessing();
ASSERT_OK(db->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
/*end=*/nullptr));
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
db->ReleaseSnapshot(snapshot1);
}
// A more complex test to verify compaction/flush should keep keys visible
// to snapshots.
TEST_P(WritePreparedTransactionTest,

Loading…
Cancel
Save