WritePrepared Txn: cross-compatibility test

Summary:
Add tests to ensure that WritePrepared and WriteCommitted policies are cross compatible when the db WAL is empty. This is important when the admin want to switch between the policies. In such case, before the switch the admin needs to empty the WAL by i) committing/rollbacking all the pending transactions, ii) FlushMemTables
Closes https://github.com/facebook/rocksdb/pull/3118

Differential Revision: D6227247

Pulled By: maysamyabandeh

fbshipit-source-id: bcde3d92c1e89cda3b9cfa69f6a20af5d8993db7
main
Maysam Yabandeh 7 years ago committed by Facebook Github Bot
parent 857adf388f
commit 2edc92bc28
  1. 6
      db/db_impl_open.cc
  2. 7
      db/dbformat.h
  3. 34
      db/write_batch.cc
  4. 3
      db/write_batch_internal.h
  5. 4
      include/rocksdb/write_batch.h
  6. 119
      utilities/transactions/transaction_test.h
  7. 28
      utilities/transactions/write_prepared_transaction_test.cc
  8. 7
      utilities/transactions/write_prepared_txn.cc
  9. 6
      utilities/write_batch_with_index/write_batch_with_index.cc
  10. 4
      utilities/write_batch_with_index/write_batch_with_index_internal.cc

@ -728,6 +728,12 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
} }
if (!status.ok()) { if (!status.ok()) {
if (status.IsNotSupported()) {
// We should not treat NotSupported as corruption. It is rather a clear
// sign that we are processing a WAL that is produced by an incompatible
// version of the code.
return status;
}
if (immutable_db_options_.wal_recovery_mode == if (immutable_db_options_.wal_recovery_mode ==
WALRecoveryMode::kSkipAnyCorruptedRecords) { WALRecoveryMode::kSkipAnyCorruptedRecords) {
// We should ignore all errors unconditionally // We should ignore all errors unconditionally

@ -49,7 +49,12 @@ enum ValueType : unsigned char {
kTypeRangeDeletion = 0xF, // meta block kTypeRangeDeletion = 0xF, // meta block
kTypeColumnFamilyBlobIndex = 0x10, // Blob DB only kTypeColumnFamilyBlobIndex = 0x10, // Blob DB only
kTypeBlobIndex = 0x11, // Blob DB only kTypeBlobIndex = 0x11, // Blob DB only
kMaxValue = 0x7F // Not used for storing records. // When the prepared record is also persisted in db, we use a different
// record. This is to ensure that the WAL that is generated by a WritePolicy
// is not mistakenly read by another, which would result into data
// inconsistency.
kTypeBeginPersistedPrepareXID = 0x12, // WAL only.
kMaxValue = 0x7F // Not used for storing records.
}; };
// Defined in dbformat.cc // Defined in dbformat.cc

@ -24,6 +24,7 @@
// kTypeEndPrepareXID // kTypeEndPrepareXID
// kTypeCommitXID varstring // kTypeCommitXID varstring
// kTypeRollbackXID varstring // kTypeRollbackXID varstring
// kTypeBeginPersistedPrepareXID varstring
// kTypeNoop // kTypeNoop
// varstring := // varstring :=
// len: varint32 // len: varint32
@ -353,6 +354,9 @@ Status ReadRecordFromWriteBatch(Slice* input, char* tag,
break; break;
case kTypeNoop: case kTypeNoop:
case kTypeBeginPrepareXID: case kTypeBeginPrepareXID:
// This indicates that the prepared batch is also persisted in the db.
// This is used in WritePreparedTxn
case kTypeBeginPersistedPrepareXID:
break; break;
case kTypeEndPrepareXID: case kTypeEndPrepareXID:
if (!GetLengthPrefixedSlice(input, xid)) { if (!GetLengthPrefixedSlice(input, xid)) {
@ -457,6 +461,24 @@ Status WriteBatch::Iterate(Handler* handler) const {
(ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE));
handler->MarkBeginPrepare(); handler->MarkBeginPrepare();
empty_batch = false; empty_batch = false;
if (!handler->WriteAfterCommit()) {
s = Status::NotSupported(
"WriteCommitted txn tag when write_after_commit_ is disabled (in "
"WritePrepared mode). If it is not due to corruption, the WAL "
"must be emptied before changing the WritePolicy.");
}
break;
case kTypeBeginPersistedPrepareXID:
assert(content_flags_.load(std::memory_order_relaxed) &
(ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE));
handler->MarkBeginPrepare();
empty_batch = false;
if (handler->WriteAfterCommit()) {
s = Status::NotSupported(
"WritePrepared txn tag when write_after_commit_ is enabled (in "
"default WriteCommitted mode). If it is not due to corruption, "
"the WAL must be emptied before changing the WritePolicy.");
}
break; break;
case kTypeEndPrepareXID: case kTypeEndPrepareXID:
assert(content_flags_.load(std::memory_order_relaxed) & assert(content_flags_.load(std::memory_order_relaxed) &
@ -575,7 +597,8 @@ Status WriteBatchInternal::InsertNoop(WriteBatch* b) {
return Status::OK(); return Status::OK();
} }
Status WriteBatchInternal::MarkEndPrepare(WriteBatch* b, const Slice& xid) { Status WriteBatchInternal::MarkEndPrepare(WriteBatch* b, const Slice& xid,
bool write_after_commit) {
// a manually constructed batch can only contain one prepare section // a manually constructed batch can only contain one prepare section
assert(b->rep_[12] == static_cast<char>(kTypeNoop)); assert(b->rep_[12] == static_cast<char>(kTypeNoop));
@ -587,7 +610,9 @@ Status WriteBatchInternal::MarkEndPrepare(WriteBatch* b, const Slice& xid) {
} }
// rewrite noop as begin marker // rewrite noop as begin marker
b->rep_[12] = static_cast<char>(kTypeBeginPrepareXID); b->rep_[12] =
static_cast<char>(write_after_commit ? kTypeBeginPrepareXID
: kTypeBeginPersistedPrepareXID);
b->rep_.push_back(static_cast<char>(kTypeEndPrepareXID)); b->rep_.push_back(static_cast<char>(kTypeEndPrepareXID));
PutLengthPrefixedSlice(&b->rep_, xid); PutLengthPrefixedSlice(&b->rep_, xid);
b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
@ -920,6 +945,9 @@ class MemTableInserter : public WriteBatch::Handler {
return *reinterpret_cast<MemPostInfoMap*>(&mem_post_info_map_); return *reinterpret_cast<MemPostInfoMap*>(&mem_post_info_map_);
} }
protected:
virtual bool WriteAfterCommit() const override { return write_after_commit_; }
public: public:
// cf_mems should not be shared with concurrent inserters // cf_mems should not be shared with concurrent inserters
MemTableInserter(SequenceNumber _sequence, ColumnFamilyMemTables* cf_mems, MemTableInserter(SequenceNumber _sequence, ColumnFamilyMemTables* cf_mems,
@ -957,6 +985,8 @@ class MemTableInserter : public WriteBatch::Handler {
MemTableInserter(const MemTableInserter&) = delete; MemTableInserter(const MemTableInserter&) = delete;
MemTableInserter& operator=(const MemTableInserter&) = delete; MemTableInserter& operator=(const MemTableInserter&) = delete;
virtual bool WriterAfterCommit() const { return write_after_commit_; }
void MaybeAdvanceSeq(bool batch_boundry = false) { void MaybeAdvanceSeq(bool batch_boundry = false) {
if (batch_boundry == seq_per_batch_) { if (batch_boundry == seq_per_batch_) {
sequence_++; sequence_++;

@ -102,7 +102,8 @@ class WriteBatchInternal {
static Status PutBlobIndex(WriteBatch* batch, uint32_t column_family_id, static Status PutBlobIndex(WriteBatch* batch, uint32_t column_family_id,
const Slice& key, const Slice& value); const Slice& key, const Slice& value);
static Status MarkEndPrepare(WriteBatch* batch, const Slice& xid); static Status MarkEndPrepare(WriteBatch* batch, const Slice& xid,
const bool write_after_commit = true);
static Status MarkRollback(WriteBatch* batch, const Slice& xid); static Status MarkRollback(WriteBatch* batch, const Slice& xid);

@ -267,6 +267,10 @@ class WriteBatch : public WriteBatchBase {
// iteration is halted. Otherwise, it continues iterating. The default // iteration is halted. Otherwise, it continues iterating. The default
// implementation always returns true. // implementation always returns true.
virtual bool Continue(); virtual bool Continue();
protected:
friend class WriteBatch;
virtual bool WriteAfterCommit() const { return true; }
}; };
Status Iterate(Handler* handler) const; Status Iterate(Handler* handler) const;

@ -236,6 +236,125 @@ class TransactionTest : public ::testing::TestWithParam<
} }
delete txn; delete txn;
}; };
// Test that we can change write policy after a clean shutdown (which would
// empty the WAL)
void CrossCompatibilityTest(TxnDBWritePolicy from_policy,
TxnDBWritePolicy to_policy, bool empty_wal) {
TransactionOptions txn_options;
ReadOptions read_options;
WriteOptions write_options;
uint32_t index = 0;
Random rnd(1103);
options.write_buffer_size = 1024; // To create more sst files
std::unordered_map<std::string, std::string> committed_kvs;
Transaction* txn;
txn_db_options.write_policy = from_policy;
ReOpen();
for (int i = 0; i < 1024; i++) {
auto istr = std::to_string(index);
auto k = Slice("foo-" + istr).ToString();
auto v = Slice("bar-" + istr).ToString();
// For test the duplicate keys
auto v2 = Slice("bar2-" + istr).ToString();
auto type = rnd.Uniform(4);
Status s;
switch (type) {
case 0:
committed_kvs[k] = v;
s = db->Put(write_options, k, v);
ASSERT_OK(s);
committed_kvs[k] = v2;
s = db->Put(write_options, k, v2);
ASSERT_OK(s);
break;
case 1: {
WriteBatch wb;
committed_kvs[k] = v;
wb.Put(k, v);
// TODO(myabandeh): remove this when we supprot duplicate keys in
// db->Write method
if (false) {
committed_kvs[k] = v2;
wb.Put(k, v2);
}
s = db->Write(write_options, &wb);
ASSERT_OK(s);
} break;
case 2:
case 3:
txn = db->BeginTransaction(write_options, txn_options);
s = txn->SetName("xid" + istr);
ASSERT_OK(s);
committed_kvs[k] = v;
s = txn->Put(k, v);
ASSERT_OK(s);
// TODO(myabandeh): remove this when we supprot duplicate keys in
// db->Write method
if (false) {
committed_kvs[k] = v2;
s = txn->Put(k, v2);
}
ASSERT_OK(s);
if (type == 3) {
s = txn->Prepare();
ASSERT_OK(s);
}
s = txn->Commit();
ASSERT_OK(s);
if (type == 2) {
auto pdb = reinterpret_cast<PessimisticTransactionDB*>(db);
// TODO(myabandeh): this is counter-intuitive. The destructor should
// also do the unregistering.
pdb->UnregisterTransaction(txn);
}
delete txn;
break;
default:
assert(0);
}
index++;
} // for i
txn_db_options.write_policy = to_policy;
auto db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
// Before upgrade/downgrade the WAL must be emptied
if (empty_wal) {
db_impl->TEST_FlushMemTable();
} else {
db_impl->FlushWAL(true);
}
auto s = ReOpenNoDelete();
if (empty_wal) {
ASSERT_OK(s);
} else {
// Test that we can detect the WAL that is produced by an incompatbile
// WritePolicy and fail fast before mis-interpreting the WAL.
ASSERT_TRUE(s.IsNotSupported());
return;
}
db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
// Check that WAL is empty
VectorLogPtr log_files;
db_impl->GetSortedWalFiles(log_files);
ASSERT_EQ(0, log_files.size());
for (auto& kv : committed_kvs) {
std::string value;
s = db->Get(read_options, kv.first, &value);
if (s.IsNotFound()) {
printf("key = %s\n", kv.first.c_str());
}
ASSERT_OK(s);
if (kv.second != value) {
printf("key = %s\n", kv.first.c_str());
}
ASSERT_EQ(kv.second, value);
}
}
}; };
class MySQLStyleTransactionTest : public TransactionTest {}; class MySQLStyleTransactionTest : public TransactionTest {};

@ -1629,6 +1629,34 @@ TEST_P(WritePreparedTransactionTest, Iterate) {
delete transaction; delete transaction;
} }
// Test that we can change write policy from WriteCommitted to WritePrepared
// after a clean shutdown (which would empty the WAL)
TEST_P(WritePreparedTransactionTest, WP_WC_DBBackwardCompatibility) {
bool empty_wal = true;
CrossCompatibilityTest(WRITE_COMMITTED, WRITE_PREPARED, empty_wal);
}
// Test that we fail fast if WAL is not emptied between changing the write
// policy from WriteCommitted to WritePrepared
TEST_P(WritePreparedTransactionTest, WP_WC_WALBackwardIncompatibility) {
bool empty_wal = true;
CrossCompatibilityTest(WRITE_COMMITTED, WRITE_PREPARED, !empty_wal);
}
// Test that we can change write policy from WritePrepare back to WriteCommitted
// after a clean shutdown (which would empty the WAL)
TEST_P(WritePreparedTransactionTest, WC_WP_ForwardCompatibility) {
bool empty_wal = true;
CrossCompatibilityTest(WRITE_PREPARED, WRITE_COMMITTED, empty_wal);
}
// Test that we fail fast if WAL is not emptied between changing the write
// policy from WriteCommitted to WritePrepared
TEST_P(WritePreparedTransactionTest, WC_WP_WALForwardIncompatibility) {
bool empty_wal = true;
CrossCompatibilityTest(WRITE_PREPARED, WRITE_COMMITTED, !empty_wal);
}
} // namespace rocksdb } // namespace rocksdb
int main(int argc, char** argv) { int main(int argc, char** argv) {

@ -61,7 +61,9 @@ Iterator* WritePreparedTxn::GetIterator(const ReadOptions& options,
Status WritePreparedTxn::PrepareInternal() { Status WritePreparedTxn::PrepareInternal() {
WriteOptions write_options = write_options_; WriteOptions write_options = write_options_;
write_options.disableWAL = false; write_options.disableWAL = false;
WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(), name_); const bool write_after_commit = true;
WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(), name_,
!write_after_commit);
const bool disable_memtable = true; const bool disable_memtable = true;
uint64_t seq_used = kMaxSequenceNumber; uint64_t seq_used = kMaxSequenceNumber;
bool collapsed = GetWriteBatch()->Collapse(); bool collapsed = GetWriteBatch()->Collapse();
@ -216,6 +218,9 @@ Status WritePreparedTxn::RollbackInternal() {
Status MarkRollback(const Slice&) override { Status MarkRollback(const Slice&) override {
return Status::InvalidArgument(); return Status::InvalidArgument();
} }
protected:
virtual bool WriteAfterCommit() const override { return false; }
} rollback_handler(db_impl_, wpt_db_, last_visible_txn, &rollback_batch); } rollback_handler(db_impl_, wpt_db_, last_visible_txn, &rollback_batch);
auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&rollback_handler); auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&rollback_handler);
assert(s.ok()); assert(s.ok());

@ -20,6 +20,7 @@
#include "rocksdb/iterator.h" #include "rocksdb/iterator.h"
#include "util/arena.h" #include "util/arena.h"
#include "util/cast_util.h" #include "util/cast_util.h"
#include "util/string_util.h"
#include "utilities/write_batch_with_index/write_batch_with_index_internal.h" #include "utilities/write_batch_with_index/write_batch_with_index_internal.h"
namespace rocksdb { namespace rocksdb {
@ -552,13 +553,15 @@ void WriteBatchWithIndex::Rep::AddNewEntry(uint32_t column_family_id) {
break; break;
case kTypeLogData: case kTypeLogData:
case kTypeBeginPrepareXID: case kTypeBeginPrepareXID:
case kTypeBeginPersistedPrepareXID:
case kTypeEndPrepareXID: case kTypeEndPrepareXID:
case kTypeCommitXID: case kTypeCommitXID:
case kTypeRollbackXID: case kTypeRollbackXID:
case kTypeNoop: case kTypeNoop:
break; break;
default: default:
return Status::Corruption("unknown WriteBatch tag"); return Status::Corruption("unknown WriteBatch tag in ReBuildIndex",
ToString(static_cast<unsigned int>(tag)));
} }
} }
@ -622,6 +625,7 @@ void WriteBatchWithIndex::Rep::AddNewEntry(uint32_t column_family_id) {
break; break;
case kTypeLogData: case kTypeLogData:
case kTypeBeginPrepareXID: case kTypeBeginPrepareXID:
case kTypeBeginPersistedPrepareXID:
case kTypeEndPrepareXID: case kTypeEndPrepareXID:
case kTypeCommitXID: case kTypeCommitXID:
case kTypeRollbackXID: case kTypeRollbackXID:

@ -71,13 +71,15 @@ Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset,
break; break;
case kTypeNoop: case kTypeNoop:
case kTypeBeginPrepareXID: case kTypeBeginPrepareXID:
case kTypeBeginPersistedPrepareXID:
case kTypeEndPrepareXID: case kTypeEndPrepareXID:
case kTypeCommitXID: case kTypeCommitXID:
case kTypeRollbackXID: case kTypeRollbackXID:
*type = kXIDRecord; *type = kXIDRecord;
break; break;
default: default:
return Status::Corruption("unknown WriteBatch tag"); return Status::Corruption("unknown WriteBatch tag ",
ToString(static_cast<unsigned int>(tag)));
} }
return Status::OK(); return Status::OK();
} }

Loading…
Cancel
Save