WritePrepared Txn: fix bug with Rollback seq

Summary:
The sequence number was not properly advanced after a rollback marker. The patch extends the existing unit tests to detect the bug and also fixes it.
Closes https://github.com/facebook/rocksdb/pull/3157

Differential Revision: D6304291

Pulled By: maysamyabandeh

fbshipit-source-id: 1b519c44a5371b802da49c9e32bd00087a8da401
main
Maysam Yabandeh 7 years ago committed by Facebook Github Bot
parent 175d5d6a9e
commit 53863b76f9
  1. 13
      db/write_batch.cc
  2. 1
      include/rocksdb/db.h
  3. 28
      utilities/transactions/transaction_test.cc
  4. 35
      utilities/transactions/transaction_test.h
  5. 59
      utilities/transactions/write_prepared_transaction_test.cc
  6. 10
      utilities/transactions/write_prepared_txn.cc
  7. 1
      utilities/transactions/write_prepared_txn_db.cc
  8. 6
      utilities/transactions/write_prepared_txn_db.h

@ -988,6 +988,16 @@ class MemTableInserter : public WriteBatch::Handler {
virtual bool WriterAfterCommit() const { return write_after_commit_; }
// The batch seq is regularly restarted; In normal mode it is set when
// MemTableInserter is constructed in the write thread and in recovery mode it
// is set when a batch, which is tagged with seq, is read from the WAL.
// Within a sequenced batch, which could be a merge of multiple batches, we
// have two policies to advance the seq: i) seq_per_key (default) and ii)
// seq_per_batch. To implement the latter we need to mark the boundary between
// the individual batches. The approach is this: 1) Use the terminating
// markers to indicate the boundary (kTypeEndPrepareXID, kTypeCommitXID,
// kTypeRollbackXID) 2) Terminate a batch with kTypeNoop in the absence of a
// natural boundary marker.
void MaybeAdvanceSeq(bool batch_boundry = false) {
if (batch_boundry == seq_per_batch_) {
sequence_++;
@ -1430,6 +1440,9 @@ class MemTableInserter : public WriteBatch::Handler {
// in non recovery we simply ignore this tag
}
const bool batch_boundry = true;
MaybeAdvanceSeq(batch_boundry);
return Status::OK();
}

@ -924,6 +924,7 @@ class DB {
// Retrieve the sorted list of all wal files with earliest file first
virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;
// Note: this API is not yet consistent with WritePrepared transactions.
// Sets iter to an iterator that is positioned at a write-batch containing
// seq_number. If the sequence number is non existent, it returns an iterator
// at the first available seq_no after the requested seq_no

@ -4837,6 +4837,8 @@ TEST_P(TransactionTest, MemoryLimitTest) {
// necessarily the one acceptable way. If the algorithm is legitimately changed,
// this unit test should be updated as well.
TEST_P(TransactionTest, SeqAdvanceTest) {
// TODO(myabandeh): must be tested with false before new releases
const bool short_test = true;
WriteOptions wopts;
FlushOptions fopt;
@ -4846,7 +4848,7 @@ TEST_P(TransactionTest, SeqAdvanceTest) {
// Do the test with NUM_BRANCHES branches in it. Each run of a test takes some
// of the branches. This is the same as counting a binary number where i-th
// bit represents whether we take branch i in the run represented by the number.
const size_t NUM_BRANCHES = 8;
const size_t NUM_BRANCHES = short_test ? 6 : 10;
// Helper function that shows if the branch is to be taken in the run
// represented by the number n.
auto branch_do = [&](size_t n, size_t* branch) {
@ -4869,7 +4871,7 @@ TEST_P(TransactionTest, SeqAdvanceTest) {
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
}
if (branch_do(n, &branch)) {
if (!short_test && branch_do(n, &branch)) {
db_impl->FlushWAL(true);
ReOpenNoDelete();
db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
@ -4891,7 +4893,7 @@ TEST_P(TransactionTest, SeqAdvanceTest) {
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
}
if (branch_do(n, &branch)) {
if (!short_test && branch_do(n, &branch)) {
db_impl->FlushWAL(true);
ReOpenNoDelete();
db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
@ -4908,7 +4910,7 @@ TEST_P(TransactionTest, SeqAdvanceTest) {
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
}
if (branch_do(n, &branch)) {
if (!short_test && branch_do(n, &branch)) {
db_impl->FlushWAL(true);
ReOpenNoDelete();
db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
@ -4916,10 +4918,24 @@ TEST_P(TransactionTest, SeqAdvanceTest) {
ASSERT_EQ(exp_seq, seq);
}
txn_t0(0);
txn_t4(0);
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
if (branch_do(n, &branch)) {
db_impl->Flush(fopt);
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
}
if (!short_test && branch_do(n, &branch)) {
db_impl->FlushWAL(true);
ReOpenNoDelete();
db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
seq = db_impl->GetLatestSequenceNumber();
ASSERT_EQ(exp_seq, seq);
}
txn_t2(0);
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
@ -4929,7 +4945,7 @@ TEST_P(TransactionTest, SeqAdvanceTest) {
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
}
if (branch_do(n, &branch)) {
if (!short_test && branch_do(n, &branch)) {
db_impl->FlushWAL(true);
ReOpenNoDelete();
db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());

@ -236,6 +236,41 @@ class TransactionTest : public ::testing::TestWithParam<
}
delete txn;
};
std::function<void(size_t)> txn_t4 = [&](size_t index) {
  // A full 2pc txn that is prepared and then rolled back, i.e., it is
  // terminated by a rollback marker instead of a commit marker.
  //
  // `index` is appended to the txn name and to every key so that concurrent
  // invocations with different indexes do not collide. On return, exp_seq has
  // been advanced by the number of sequence numbers this txn is expected to
  // consume under the current write policy.
  TransactionOptions txn_options;
  WriteOptions write_options;
  Transaction* txn = db->BeginTransaction(write_options, txn_options);
  auto istr = std::to_string(index);
  auto s = txn->SetName("xid" + istr);
  ASSERT_OK(s);
  // Verify each write individually; otherwise a failed Put would be masked by
  // the status of the one that follows it.
  s = txn->Put(Slice("foo" + istr), Slice("bar"));
  ASSERT_OK(s);
  s = txn->Put(Slice("foo2" + istr), Slice("bar2"));
  ASSERT_OK(s);
  s = txn->Put(Slice("foo3" + istr), Slice("bar3"));
  ASSERT_OK(s);
  s = txn->Put(Slice("foo4" + istr), Slice("bar4"));
  ASSERT_OK(s);
  s = txn->Put(Slice("foo5" + istr), Slice("bar5"));
  ASSERT_OK(s);
  // NOTE(review): the counters are bumped exactly as in the original block;
  // presumably they are consumed by the callers' verification -- confirm.
  expected_commits++;
  s = txn->Prepare();
  ASSERT_OK(s);
  commit_writes++;
  s = txn->Rollback();
  ASSERT_OK(s);
  if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) {
    // No seq is consumed for deleting the txn buffer
    exp_seq += 0;
  } else {
    // Consume one seq per batch
    exp_seq++;
    // Consume one seq per rollback batch
    exp_seq++;
    if (options.two_write_queues) {
      // Consume one seq for rollback commit
      exp_seq++;
    }
  }
  delete txn;
};
// Test that we can change write policy after a clean shutdown (which would
// empty the WAL)

@ -605,12 +605,13 @@ TEST_P(WritePreparedTransactionTest, SeqAdvanceConcurrentTest) {
FlushOptions fopt;
// Number of different txn types we use in this test
const size_t type_cnt = 4;
const size_t type_cnt = 5;
// The size of the first write group
// TODO(myabandeh): This should be increased for pre-release tests
const size_t first_group_size = 2;
// Total number of txns we run in each test
const size_t txn_cnt = first_group_size * 2;
// TODO(myabandeh): This should be increased for pre-release tests
const size_t txn_cnt = first_group_size + 1;
size_t base[txn_cnt + 1] = {
1,
@ -675,6 +676,9 @@ TEST_P(WritePreparedTransactionTest, SeqAdvanceConcurrentTest) {
case 3:
threads.emplace_back(txn_t3, bi);
break;
case 4:
threads.emplace_back(txn_t3, bi);
break;
default:
assert(false);
}
@ -710,16 +714,30 @@ TEST_P(WritePreparedTransactionTest, SeqAdvanceConcurrentTest) {
rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
// The latest seq might be due to a commit without prepare and hence not
// persisted in the WAL. To make the verification of seq after recovery
// easier we write in a transaction with prepare which makes the latest seq
// to be persisted via the commitmarker.
txn_t3(0);
// persisted in the WAL. We need to discount such seqs if they are not
// continued by any seq consumed by a value write.
if (options.two_write_queues) {
WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
MutexLock l(&wp_db->seq_for_metadata_mutex_);
auto& vec = wp_db->seq_for_metadata;
std::sort(vec.begin(), vec.end());
// Going backward, discount any last seq consumed for metadata until we see
// a seq that is consumed for actual key/values.
auto rit = vec.rbegin();
for (; rit != vec.rend(); ++rit) {
if (*rit == exp_seq) {
exp_seq--;
} else {
break;
}
}
}
// Check if recovery preserves the last sequence number
db_impl->FlushWAL(true);
ReOpenNoDelete();
db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
seq = db_impl->GetLatestSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
// Check if flush preserves the last sequence number
@ -1134,25 +1152,18 @@ TEST_P(WritePreparedTransactionTest, RollbackTest) {
ASSERT_SAME(db, s4, v4, "key4");
if (crash) {
// TODO(myabandeh): replace it with true crash (commented lines below)
// after compaction PR is landed.
delete txn;
auto db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
auto seq = db_impl->GetLatestSequenceNumber();
db_impl->FlushWAL(true);
ReOpenNoDelete();
wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
SequenceNumber prev_max = wp_db->max_evicted_seq_;
wp_db->AdvanceMaxEvictedSeq(prev_max, seq);
// delete txn;
// auto db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
// db_impl->FlushWAL(true);
// ReOpenNoDelete();
// wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
// txn = db->GetTransactionByName("xid0");
// ASSERT_FALSE(wp_db->delayed_prepared_empty_);
// ReadLock rl(&wp_db->prepared_mutex_);
// ASSERT_TRUE(wp_db->prepared_txns_.empty());
// ASSERT_FALSE(wp_db->delayed_prepared_.empty());
// ASSERT_TRUE(wp_db->delayed_prepared_.find(txn->GetId()) !=
// wp_db->delayed_prepared_.end());
txn = db->GetTransactionByName("xid0");
ASSERT_FALSE(wp_db->delayed_prepared_empty_);
ReadLock rl(&wp_db->prepared_mutex_);
ASSERT_TRUE(wp_db->prepared_txns_.empty());
ASSERT_FALSE(wp_db->delayed_prepared_.empty());
ASSERT_TRUE(wp_db->delayed_prepared_.find(txn->GetId()) !=
wp_db->delayed_prepared_.end());
}
ASSERT_SAME(db, s1, v1, "key1");

@ -93,7 +93,12 @@ Status WritePreparedTxn::CommitWithoutPrepareInternal() {
SequenceNumber WritePreparedTxn::GetACommitSeqNumber(SequenceNumber prep_seq) {
if (db_impl_->immutable_db_options().two_write_queues) {
return db_impl_->IncAndFetchSequenceNumber();
auto s = db_impl_->IncAndFetchSequenceNumber();
#ifndef NDEBUG
MutexLock l(&wpt_db_->seq_for_metadata_mutex_);
wpt_db_->seq_for_metadata.push_back(s);
#endif
return s;
} else {
return prep_seq;
}
@ -161,8 +166,6 @@ Status WritePreparedTxn::RollbackInternal() {
WriteBatch rollback_batch;
assert(GetId() != kMaxSequenceNumber);
assert(GetId() > 0);
// In the absence of Prepare markers, use Noop as a batch separator
WriteBatchInternal::InsertNoop(&rollback_batch);
// In WritePrepared, the txn id is the same as prepare seq
auto last_visible_txn = GetId() - 1;
struct RollbackWriteBatchBuilder : public WriteBatch::Handler {
@ -227,6 +230,7 @@ Status WritePreparedTxn::RollbackInternal() {
if (!s.ok()) {
return s;
}
// The Rollback marker will be used as a batch separator
WriteBatchInternal::MarkRollback(&rollback_batch, name_);
const bool disable_memtable = true;
const uint64_t no_log_ref = 0;

@ -215,6 +215,7 @@ bool WritePreparedTxnDB::IsInSnapshot(uint64_t prep_seq,
}
{
// We should not normally reach here
// TODO(myabandeh): check only if snapshot_seq is in the list of snapshots
ReadLock rl(&old_commit_map_mutex_);
auto old_commit_entry = old_commit_map_.find(prep_seq);
if (old_commit_entry == old_commit_map_.end() ||

@ -179,6 +179,12 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
// Struct to hold ownership of snapshot and read callback for cleanup.
struct IteratorState;
#ifndef NDEBUG
// For unit tests we can keep track of the seq numbers that are used for metadata as opposed to actual key/values
std::vector<uint64_t> seq_for_metadata;
mutable port::Mutex seq_for_metadata_mutex_;
#endif
private:
friend class WritePreparedTransactionTest_IsInSnapshotTest_Test;
friend class WritePreparedTransactionTest_CheckAgainstSnapshotsTest_Test;

Loading…
Cancel
Save