|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
|
|
|
|
#include "utilities/transactions/transaction_test.h"
|
|
|
|
|
|
|
|
#include <algorithm>
|
|
|
|
#include <functional>
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
#include <string>
|
|
|
|
#include <thread>
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
#include "db/db_impl/db_impl.h"
|
|
|
|
#include "port/port.h"
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/options.h"
|
|
|
|
#include "rocksdb/perf_context.h"
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
#include "rocksdb/utilities/transaction.h"
|
|
|
|
#include "rocksdb/utilities/transaction_db.h"
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
9 years ago
|
|
|
#include "table/mock_table.h"
|
|
|
|
#include "test_util/sync_point.h"
|
|
|
|
#include "test_util/testharness.h"
|
|
|
|
#include "test_util/testutil.h"
|
|
|
|
#include "test_util/transaction_test_util.h"
|
|
|
|
#include "util/random.h"
|
|
|
|
#include "util/string_util.h"
|
|
|
|
#include "utilities/fault_injection_env.h"
|
|
|
|
#include "utilities/merge_operators.h"
|
|
|
|
#include "utilities/merge_operators/string_append/stringappend.h"
|
|
|
|
#include "utilities/transactions/pessimistic_transaction_db.h"
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
INSTANTIATE_TEST_CASE_P(
|
|
|
|
DBAsBaseDB, TransactionTest,
|
|
|
|
::testing::Values(
|
|
|
|
std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite),
|
|
|
|
std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite),
|
|
|
|
std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
|
|
|
|
std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite),
|
|
|
|
std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite),
|
|
|
|
std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite),
|
|
|
|
std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite)));
|
|
|
|
INSTANTIATE_TEST_CASE_P(
|
|
|
|
DBAsBaseDB, TransactionStressTest,
|
|
|
|
::testing::Values(
|
|
|
|
std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite),
|
|
|
|
std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite),
|
|
|
|
std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
|
|
|
|
std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite),
|
|
|
|
std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite),
|
|
|
|
std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite),
|
|
|
|
std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite)));
|
|
|
|
INSTANTIATE_TEST_CASE_P(
|
|
|
|
StackableDBAsBaseDB, TransactionTest,
|
|
|
|
::testing::Values(
|
|
|
|
std::make_tuple(true, true, WRITE_COMMITTED, kOrderedWrite),
|
|
|
|
std::make_tuple(true, true, WRITE_PREPARED, kOrderedWrite),
|
|
|
|
std::make_tuple(true, true, WRITE_UNPREPARED, kOrderedWrite)));
|
|
|
|
|
|
|
|
// MySQLStyleTransactionTest takes far too long for valgrind to run. Only do it
|
|
|
|
// in full mode (`ROCKSDB_FULL_VALGRIND_RUN` compiler flag is set).
|
|
|
|
#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
|
|
|
INSTANTIATE_TEST_CASE_P(
|
|
|
|
MySQLStyleTransactionTest, MySQLStyleTransactionTest,
|
|
|
|
::testing::Values(
|
|
|
|
std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite, false),
|
|
|
|
std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite, false),
|
|
|
|
std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, false),
|
|
|
|
std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, true),
|
|
|
|
std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, false),
|
|
|
|
std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, true),
|
|
|
|
std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, false),
|
|
|
|
std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, true),
|
|
|
|
std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, false),
|
|
|
|
std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, true),
|
|
|
|
std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, false),
|
|
|
|
std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, true)));
|
|
|
|
#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, DoubleEmptyWrite) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
write_options.sync = true;
|
|
|
|
write_options.disableWAL = false;
|
|
|
|
|
|
|
|
WriteBatch batch;
|
|
|
|
|
|
|
|
ASSERT_OK(db->Write(write_options, &batch));
|
|
|
|
ASSERT_OK(db->Write(write_options, &batch));
|
|
|
|
|
|
|
|
// Also test committing empty transactions in 2PC
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_OK(txn0->SetName("xid"));
|
|
|
|
ASSERT_OK(txn0->Prepare());
|
|
|
|
ASSERT_OK(txn0->Commit());
|
|
|
|
delete txn0;
|
|
|
|
|
|
|
|
// Also test that it works during recovery
|
|
|
|
txn0 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_OK(txn0->SetName("xid2"));
|
|
|
|
ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0a")));
|
|
|
|
ASSERT_OK(txn0->Prepare());
|
|
|
|
delete txn0;
|
|
|
|
reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
|
|
|
|
ASSERT_OK(ReOpenNoDelete());
|
|
|
|
assert(db != nullptr);
|
|
|
|
txn0 = db->GetTransactionByName("xid2");
|
|
|
|
ASSERT_OK(txn0->Commit());
|
|
|
|
delete txn0;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, SuccessTest) {
|
|
|
|
ASSERT_OK(db->ResetStats());
|
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
std::string value;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar")));
|
|
|
|
ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("bar")));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options, TransactionOptions());
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
|
|
|
|
ASSERT_EQ(0, txn->GetNumPuts());
|
|
|
|
ASSERT_LE(0, txn->GetID());
|
|
|
|
|
|
|
|
ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
|
|
|
|
ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2")));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
ASSERT_EQ(1, txn->GetNumPuts());
|
|
|
|
|
|
|
|
ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
|
|
|
|
ASSERT_OK(txn->Commit());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
ASSERT_OK(db->Get(read_options, "foo", &value));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
Fix a silent data loss for write-committed txn (#9571)
Summary:
The following sequence of events can cause silent data loss for write-committed
transactions.
```
Time thread 1 bg flush
| db->Put("a")
| txn = NewTxn()
| txn->Put("b", "v")
| txn->Prepare() // writes only to 5.log
| db->SwitchMemtable() // memtable 1 has "a"
| // close 5.log,
| // creates 8.log
| trigger flush
| pick memtable 1
| unlock db mutex
| write new sst
| txn->ctwb->Put("gtid", "1") // writes 8.log
| txn->Commit() // writes to 8.log
| // writes to memtable 2
| compute min_log_number_to_keep_2pc, this
| will be 8 (incorrect).
|
| Purge obsolete wals, including 5.log
|
V
```
At this point, writes of txn exists only in memtable. Close db without flush because db thinks the data in
memtable are backed by log. Then reopen, the writes are lost except key-value pair {"gtid"->"1"},
only the commit marker of txn is in 8.log
The reason lies in `PrecomputeMinLogNumberToKeep2PC()` which calls `FindMinPrepLogReferencedByMemTable()`.
In the above example, when bg flush thread tries to find obsolete wals, it uses the information
computed by `PrecomputeMinLogNumberToKeep2PC()`. The return value of `PrecomputeMinLogNumberToKeep2PC()`
depends on three components
- `PrecomputeMinLogNumberToKeepNon2PC()`. This represents the WAL that has unflushed data. As the name of this method suggests, it does not account for 2PC. Although the keys reside in the prepare section of a previous WAL, the column family references the current WAL when they are actually inserted into the memtable during txn commit.
- `prep_tracker->FindMinLogContainingOutstandingPrep()`. This represents the WAL with a prepare section but the txn hasn't committed.
- `FindMinPrepLogReferencedByMemTable()`. This represents the WAL on which some memtables (mutable and immutable) depend for their unflushed data.
The bug lies in `FindMinPrepLogReferencedByMemTable()`. Originally, this function skips checking the column families
that are being flushed, but the unit test added in this PR shows that they should not be. In this unit test, there is
only the default column family, and one of its memtables has unflushed data backed by a prepare section in 5.log.
We should return this information via `FindMinPrepLogReferencedByMemTable()`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9571
Test Plan:
```
./transaction_test --gtest_filter=*/TransactionTest.SwitchMemtableDuringPrepareAndCommit_WC/*
make check
```
Reviewed By: siying
Differential Revision: D34235236
Pulled By: riversand963
fbshipit-source-id: 120eb21a666728a38dda77b96276c6af72b008b1
3 years ago
|
|
|
TEST_P(TransactionTest, SwitchMemtableDuringPrepareAndCommit_WC) {
|
|
|
|
const TxnDBWritePolicy write_policy = std::get<2>(GetParam());
|
|
|
|
|
|
|
|
if (write_policy != TxnDBWritePolicy::WRITE_COMMITTED) {
|
|
|
|
ROCKSDB_GTEST_BYPASS("Test applies to write-committed only");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_OK(db->Put(WriteOptions(), "key0", "value"));
|
|
|
|
|
|
|
|
TransactionOptions txn_opts;
|
|
|
|
txn_opts.use_only_the_last_commit_time_batch_for_recovery = true;
|
|
|
|
Transaction* txn = db->BeginTransaction(WriteOptions(), txn_opts);
|
|
|
|
assert(txn);
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"FlushJob::WriteLevel0Table", [&](void* arg) {
|
|
|
|
// db mutex not held.
|
|
|
|
auto* mems = reinterpret_cast<autovector<MemTable*>*>(arg);
|
|
|
|
assert(mems);
|
|
|
|
ASSERT_EQ(1, mems->size());
|
|
|
|
auto* ctwb = txn->GetCommitTimeWriteBatch();
|
|
|
|
ASSERT_OK(ctwb->Put("gtid", "123"));
|
|
|
|
ASSERT_OK(txn->Commit());
|
|
|
|
delete txn;
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
ASSERT_OK(txn->Put("key1", "value"));
|
|
|
|
ASSERT_OK(txn->SetName("txn1"));
|
|
|
|
|
|
|
|
ASSERT_OK(txn->Prepare());
|
|
|
|
|
|
|
|
auto dbimpl = static_cast_with_check<DBImpl>(db->GetRootDB());
|
|
|
|
ASSERT_OK(dbimpl->TEST_SwitchMemtable(nullptr));
|
|
|
|
ASSERT_OK(dbimpl->TEST_FlushMemTable(
|
|
|
|
/*wait=*/false, /*allow_write_stall=*/true, /*cfh=*/nullptr));
|
|
|
|
|
|
|
|
ASSERT_OK(dbimpl->TEST_WaitForFlushMemTable());
|
|
|
|
|
|
|
|
{
|
|
|
|
std::string value;
|
|
|
|
ASSERT_OK(db->Get(ReadOptions(), "key1", &value));
|
|
|
|
ASSERT_EQ("value", value);
|
|
|
|
}
|
|
|
|
|
|
|
|
delete db;
|
|
|
|
db = nullptr;
|
|
|
|
Status s;
|
|
|
|
if (use_stackable_db_ == false) {
|
|
|
|
s = TransactionDB::Open(options, txn_db_options, dbname, &db);
|
|
|
|
} else {
|
|
|
|
s = OpenWithStackableDB();
|
|
|
|
}
|
|
|
|
ASSERT_OK(s);
|
|
|
|
assert(db);
|
|
|
|
|
|
|
|
{
|
|
|
|
std::string value;
|
|
|
|
ASSERT_OK(db->Get(ReadOptions(), "gtid", &value));
|
|
|
|
ASSERT_EQ("123", value);
|
|
|
|
|
|
|
|
ASSERT_OK(db->Get(ReadOptions(), "key1", &value));
|
|
|
|
ASSERT_EQ("value", value);
|
|
|
|
}
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
}
|
|
|
|
|
|
|
|
// The test clarifies the contract of do_validate and assume_tracked
|
|
|
|
// in GetForUpdate and Put/Merge/Delete
|
|
|
|
TEST_P(TransactionTest, AssumeExclusiveTracked) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
txn_options.lock_timeout = 1;
|
|
|
|
const bool EXCLUSIVE = true;
|
|
|
|
const bool DO_VALIDATE = true;
|
|
|
|
const bool ASSUME_LOCKED = true;
|
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
txn->SetSnapshot();
|
|
|
|
|
|
|
|
// commit a value after the snapshot is taken
|
|
|
|
ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar")));
|
|
|
|
|
|
|
|
// By default write should fail to the commit after our snapshot
|
|
|
|
s = txn->GetForUpdate(read_options, "foo", &value, EXCLUSIVE);
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
// But the user could direct the db to skip validating the snapshot. The read
|
|
|
|
// value then should be the most recently committed
|
|
|
|
ASSERT_OK(
|
|
|
|
txn->GetForUpdate(read_options, "foo", &value, EXCLUSIVE, !DO_VALIDATE));
|
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
|
|
|
|
// Although ValidateSnapshot is skipped the key must have still got locked
|
|
|
|
s = db->Put(write_options, Slice("foo"), Slice("bar"));
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
|
|
|
|
// By default the write operations should fail due to the commit after the
|
|
|
|
// snapshot
|
|
|
|
s = txn->Put(Slice("foo"), Slice("bar1"));
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
s = txn->Put(db->DefaultColumnFamily(), Slice("foo"), Slice("bar1"),
|
|
|
|
!ASSUME_LOCKED);
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
// But the user could direct the db that it already assumes exclusive lock on
|
|
|
|
// the key due to the previous GetForUpdate call.
|
|
|
|
ASSERT_OK(txn->Put(db->DefaultColumnFamily(), Slice("foo"), Slice("bar1"),
|
|
|
|
ASSUME_LOCKED));
|
|
|
|
ASSERT_OK(txn->Merge(db->DefaultColumnFamily(), Slice("foo"), Slice("bar2"),
|
|
|
|
ASSUME_LOCKED));
|
|
|
|
ASSERT_OK(
|
|
|
|
txn->Delete(db->DefaultColumnFamily(), Slice("foo"), ASSUME_LOCKED));
|
|
|
|
ASSERT_OK(txn->SingleDelete(db->DefaultColumnFamily(), Slice("foo"),
|
|
|
|
ASSUME_LOCKED));
|
|
|
|
|
|
|
|
ASSERT_OK(txn->Rollback());
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
|
|
|
// This test clarifies the contract of ValidateSnapshot
|
|
|
|
TEST_P(TransactionTest, ValidateSnapshotTest) {
|
|
|
|
for (bool with_flush : {true}) {
|
|
|
|
for (bool with_2pc : {true}) {
|
|
|
|
ASSERT_OK(ReOpen());
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
std::string value;
|
|
|
|
|
|
|
|
assert(db != nullptr);
|
|
|
|
Transaction* txn1 =
|
|
|
|
db->BeginTransaction(write_options, TransactionOptions());
|
|
|
|
ASSERT_TRUE(txn1);
|
|
|
|
ASSERT_OK(txn1->Put(Slice("foo"), Slice("bar1")));
|
|
|
|
if (with_2pc) {
|
|
|
|
ASSERT_OK(txn1->SetName("xid1"));
|
|
|
|
ASSERT_OK(txn1->Prepare());
|
|
|
|
}
|
|
|
|
|
|
|
|
if (with_flush) {
|
|
|
|
auto db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
|
|
|
|
ASSERT_OK(db_impl->TEST_FlushMemTable(true));
|
|
|
|
// Make sure the flushed memtable is not kept in memory
|
|
|
|
int max_memtable_in_history =
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
5 years ago
|
|
|
std::max(
|
|
|
|
options.max_write_buffer_number,
|
|
|
|
static_cast<int>(options.max_write_buffer_size_to_maintain) /
|
|
|
|
static_cast<int>(options.write_buffer_size)) +
|
|
|
|
1;
|
|
|
|
for (int i = 0; i < max_memtable_in_history; i++) {
|
|
|
|
ASSERT_OK(db->Put(write_options, Slice("key"), Slice("value")));
|
|
|
|
ASSERT_OK(db_impl->TEST_FlushMemTable(true));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Transaction* txn2 =
|
|
|
|
db->BeginTransaction(write_options, TransactionOptions());
|
|
|
|
ASSERT_TRUE(txn2);
|
|
|
|
txn2->SetSnapshot();
|
|
|
|
|
|
|
|
ASSERT_OK(txn1->Commit());
|
|
|
|
delete txn1;
|
|
|
|
|
|
|
|
auto pes_txn2 = dynamic_cast<PessimisticTransaction*>(txn2);
|
|
|
|
// Test the simple case where the key is not tracked yet
|
|
|
|
auto trakced_seq = kMaxSequenceNumber;
|
|
|
|
auto s = pes_txn2->ValidateSnapshot(db->DefaultColumnFamily(), "foo",
|
|
|
|
&trakced_seq);
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
delete txn2;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, WaitingTxn) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
TransactionOptions txn_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
3 years ago
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
txn_options.lock_timeout = 1;
|
|
|
|
s = db->Put(write_options, Slice("foo"), Slice("bar"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
/* create second cf */
|
|
|
|
ColumnFamilyHandle* cfa;
|
|
|
|
ColumnFamilyOptions cf_options;
|
|
|
|
s = db->CreateColumnFamily(cf_options, "CFA", &cfa);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = db->Put(write_options, cfa, Slice("foo"), Slice("bar"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
TransactionID id1 = txn1->GetID();
|
|
|
|
ASSERT_TRUE(txn1);
|
|
|
|
ASSERT_TRUE(txn2);
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"PointLockManager::AcquireWithTimeout:WaitingTxn", [&](void* /*arg*/) {
|
|
|
|
std::string key;
|
|
|
|
uint32_t cf_id;
|
|
|
|
std::vector<TransactionID> wait = txn2->GetWaitingTxns(&cf_id, &key);
|
|
|
|
ASSERT_EQ(key, "foo");
|
|
|
|
ASSERT_EQ(wait.size(), 1);
|
|
|
|
ASSERT_EQ(wait[0], id1);
|
|
|
|
ASSERT_EQ(cf_id, 0U);
|
|
|
|
});
|
|
|
|
|
|
|
|
get_perf_context()->Reset();
|
|
|
|
// lock key in default cf
|
|
|
|
s = txn1->GetForUpdate(read_options, "foo", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
ASSERT_EQ(get_perf_context()->key_lock_wait_count, 0);
|
|
|
|
|
|
|
|
// lock key in cfa
|
|
|
|
s = txn1->GetForUpdate(read_options, cfa, "foo", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
ASSERT_EQ(get_perf_context()->key_lock_wait_count, 0);
|
|
|
|
|
|
|
|
auto lock_data = db->GetLockStatusData();
|
|
|
|
// Locked keys exist in both column family.
|
|
|
|
ASSERT_EQ(lock_data.size(), 2);
|
|
|
|
|
|
|
|
auto cf_iterator = lock_data.begin();
|
|
|
|
|
|
|
|
// The iterator points to an unordered_multimap
|
|
|
|
// thus the test can not assume any particular order.
|
|
|
|
|
|
|
|
// Column family is 1 or 0 (cfa).
|
|
|
|
if (cf_iterator->first != 1 && cf_iterator->first != 0) {
|
|
|
|
FAIL();
|
|
|
|
}
|
|
|
|
// The locked key is "foo" and is locked by txn1
|
|
|
|
ASSERT_EQ(cf_iterator->second.key, "foo");
|
|
|
|
ASSERT_EQ(cf_iterator->second.ids.size(), 1);
|
|
|
|
ASSERT_EQ(cf_iterator->second.ids[0], txn1->GetID());
|
|
|
|
|
|
|
|
cf_iterator++;
|
|
|
|
|
|
|
|
// Column family is 0 (default) or 1.
|
|
|
|
if (cf_iterator->first != 1 && cf_iterator->first != 0) {
|
|
|
|
FAIL();
|
|
|
|
}
|
|
|
|
// The locked key is "foo" and is locked by txn1
|
|
|
|
ASSERT_EQ(cf_iterator->second.key, "foo");
|
|
|
|
ASSERT_EQ(cf_iterator->second.ids.size(), 1);
|
|
|
|
ASSERT_EQ(cf_iterator->second.ids[0], txn1->GetID());
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
s = txn2->GetForUpdate(read_options, "foo", &value);
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
|
|
|
|
ASSERT_EQ(get_perf_context()->key_lock_wait_count, 1);
|
|
|
|
ASSERT_GE(get_perf_context()->key_lock_wait_time, 0);
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
|
|
|
|
delete cfa;
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, SharedLocks) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
txn_options.lock_timeout = 1;
|
|
|
|
s = db->Put(write_options, Slice("foo"), Slice("bar"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
Transaction* txn3 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn1);
|
|
|
|
ASSERT_TRUE(txn2);
|
|
|
|
ASSERT_TRUE(txn3);
|
|
|
|
|
|
|
|
// Test shared access between txns
|
|
|
|
s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn3->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
auto lock_data = db->GetLockStatusData();
|
|
|
|
ASSERT_EQ(lock_data.size(), 1);
|
|
|
|
|
|
|
|
auto cf_iterator = lock_data.begin();
|
|
|
|
ASSERT_EQ(cf_iterator->second.key, "foo");
|
|
|
|
|
|
|
|
// We compare whether the set of txns locking this key is the same. To do
|
|
|
|
// this, we need to sort both vectors so that the comparison is done
|
|
|
|
// correctly.
|
|
|
|
std::vector<TransactionID> expected_txns = {txn1->GetID(), txn2->GetID(),
|
|
|
|
txn3->GetID()};
|
|
|
|
std::vector<TransactionID> lock_txns = cf_iterator->second.ids;
|
|
|
|
ASSERT_EQ(expected_txns, lock_txns);
|
|
|
|
ASSERT_FALSE(cf_iterator->second.exclusive);
|
|
|
|
|
|
|
|
ASSERT_OK(txn1->Rollback());
|
|
|
|
ASSERT_OK(txn2->Rollback());
|
|
|
|
ASSERT_OK(txn3->Rollback());
|
|
|
|
|
|
|
|
// Test txn1 and txn2 sharing a lock and txn3 trying to obtain it.
|
|
|
|
s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn3->GetForUpdate(read_options, "foo", nullptr);
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
|
|
|
|
|
|
|
|
txn1->UndoGetForUpdate("foo");
|
|
|
|
s = txn3->GetForUpdate(read_options, "foo", nullptr);
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
|
|
|
|
|
|
|
|
txn2->UndoGetForUpdate("foo");
|
|
|
|
s = txn3->GetForUpdate(read_options, "foo", nullptr);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_OK(txn1->Rollback());
|
|
|
|
ASSERT_OK(txn2->Rollback());
|
|
|
|
ASSERT_OK(txn3->Rollback());
|
|
|
|
|
|
|
|
// Test txn1 and txn2 sharing a lock and txn2 trying to upgrade lock.
|
|
|
|
s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->GetForUpdate(read_options, "foo", nullptr);
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
|
|
|
|
|
|
|
|
txn1->UndoGetForUpdate("foo");
|
|
|
|
s = txn2->GetForUpdate(read_options, "foo", nullptr);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_OK(txn1->Rollback());
|
|
|
|
ASSERT_OK(txn2->Rollback());
|
|
|
|
|
|
|
|
// Test txn1 trying to downgrade its lock.
|
|
|
|
s = txn1->GetForUpdate(read_options, "foo", nullptr, true /* exclusive */);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
|
|
|
|
|
|
|
|
// Should still fail after "downgrading".
|
|
|
|
s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
|
|
|
|
|
|
|
|
ASSERT_OK(txn1->Rollback());
|
|
|
|
ASSERT_OK(txn2->Rollback());
|
|
|
|
|
|
|
|
// Test txn1 holding an exclusive lock and txn2 trying to obtain shared
|
|
|
|
// access.
|
|
|
|
s = txn1->GetForUpdate(read_options, "foo", nullptr);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
|
|
|
|
|
|
|
|
txn1->UndoGetForUpdate("foo");
|
|
|
|
s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
delete txn3;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, DeadlockCycleShared) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
|
|
|
|
txn_options.lock_timeout = 1000000;
|
|
|
|
txn_options.deadlock_detect = true;
|
|
|
|
|
|
|
|
// Set up a wait for chain like this:
|
|
|
|
//
|
|
|
|
// Tn -> T(n*2)
|
|
|
|
// Tn -> T(n*2 + 1)
|
|
|
|
//
|
|
|
|
// So we have:
|
|
|
|
// T1 -> T2 -> T4 ...
|
|
|
|
// | |> T5 ...
|
|
|
|
// |> T3 -> T6 ...
|
|
|
|
// |> T7 ...
|
|
|
|
// up to T31, then T[16 - 31] -> T1.
|
|
|
|
// Note that Tn holds lock on floor(n / 2).
|
|
|
|
|
|
|
|
std::vector<Transaction*> txns(31);
|
|
|
|
|
|
|
|
for (uint32_t i = 0; i < 31; i++) {
|
|
|
|
txns[i] = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txns[i]);
|
|
|
|
auto s = txns[i]->GetForUpdate(read_options, std::to_string((i + 1) / 2),
|
|
|
|
nullptr, false /* exclusive */);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::atomic<uint32_t> checkpoints(0);
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"PointLockManager::AcquireWithTimeout:WaitingTxn",
|
|
|
|
[&](void* /*arg*/) { checkpoints.fetch_add(1); });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// We want the leaf transactions to block and hold everyone back.
|
|
|
|
std::vector<port::Thread> threads;
|
|
|
|
for (uint32_t i = 0; i < 15; i++) {
|
|
|
|
std::function<void()> blocking_thread = [&, i] {
|
|
|
|
auto s = txns[i]->GetForUpdate(read_options, std::to_string(i + 1),
|
|
|
|
nullptr, true /* exclusive */);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_OK(txns[i]->Rollback());
|
|
|
|
delete txns[i];
|
|
|
|
};
|
|
|
|
threads.emplace_back(blocking_thread);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Wait until all threads are waiting on each other.
|
|
|
|
while (checkpoints.load() != 15) {
|
|
|
|
/* sleep override */
|
|
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
|
|
|
}
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
|
|
|
|
// Complete the cycle T[16 - 31] -> T1
|
|
|
|
for (uint32_t i = 15; i < 31; i++) {
|
|
|
|
auto s =
|
|
|
|
txns[i]->GetForUpdate(read_options, "0", nullptr, true /* exclusive */);
|
|
|
|
ASSERT_TRUE(s.IsDeadlock());
|
|
|
|
|
|
|
|
// Calculate next buffer len, plateau at 5 when 5 records are inserted.
|
|
|
|
const uint32_t curr_dlock_buffer_len_ =
|
|
|
|
(i - 14 > kInitialMaxDeadlocks) ? kInitialMaxDeadlocks : (i - 14);
|
|
|
|
|
|
|
|
auto dlock_buffer = db->GetDeadlockInfoBuffer();
|
|
|
|
ASSERT_EQ(dlock_buffer.size(), curr_dlock_buffer_len_);
|
|
|
|
auto dlock_entry = dlock_buffer[0].path;
|
|
|
|
ASSERT_EQ(dlock_entry.size(), kInitialMaxDeadlocks);
|
|
|
|
int64_t pre_deadlock_time = dlock_buffer[0].deadlock_time;
|
|
|
|
int64_t cur_deadlock_time = 0;
|
|
|
|
for (auto const& dl_path_rec : dlock_buffer) {
|
|
|
|
cur_deadlock_time = dl_path_rec.deadlock_time;
|
|
|
|
ASSERT_NE(cur_deadlock_time, 0);
|
|
|
|
ASSERT_TRUE(cur_deadlock_time <= pre_deadlock_time);
|
|
|
|
pre_deadlock_time = cur_deadlock_time;
|
|
|
|
}
|
|
|
|
|
|
|
|
int64_t curr_waiting_key = 0;
|
|
|
|
|
|
|
|
// Offset of each txn id from the root of the shared dlock tree's txn id.
|
|
|
|
int64_t offset_root = dlock_entry[0].m_txn_id - 1;
|
|
|
|
// Offset of the final entry in the dlock path from the root's txn id.
|
|
|
|
TransactionID leaf_id =
|
|
|
|
dlock_entry[dlock_entry.size() - 1].m_txn_id - offset_root;
|
|
|
|
|
|
|
|
for (auto it = dlock_entry.rbegin(); it != dlock_entry.rend(); ++it) {
|
|
|
|
auto dl_node = *it;
|
|
|
|
ASSERT_EQ(dl_node.m_txn_id, offset_root + leaf_id);
|
|
|
|
ASSERT_EQ(dl_node.m_cf_id, 0U);
|
|
|
|
ASSERT_EQ(dl_node.m_waiting_key, std::to_string(curr_waiting_key));
|
|
|
|
ASSERT_EQ(dl_node.m_exclusive, true);
|
|
|
|
|
|
|
|
if (curr_waiting_key == 0) {
|
|
|
|
curr_waiting_key = leaf_id;
|
|
|
|
}
|
|
|
|
curr_waiting_key /= 2;
|
|
|
|
leaf_id /= 2;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Rollback the leaf transaction.
|
|
|
|
for (uint32_t i = 15; i < 31; i++) {
|
|
|
|
ASSERT_OK(txns[i]->Rollback());
|
|
|
|
delete txns[i];
|
|
|
|
}
|
|
|
|
|
|
|
|
for (auto& t : threads) {
|
|
|
|
t.join();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Downsize the buffer and verify the 3 latest deadlocks are preserved.
|
|
|
|
auto dlock_buffer_before_resize = db->GetDeadlockInfoBuffer();
|
|
|
|
db->SetDeadlockInfoBufferSize(3);
|
|
|
|
auto dlock_buffer_after_resize = db->GetDeadlockInfoBuffer();
|
|
|
|
ASSERT_EQ(dlock_buffer_after_resize.size(), 3);
|
|
|
|
|
|
|
|
for (uint32_t i = 0; i < dlock_buffer_after_resize.size(); i++) {
|
|
|
|
for (uint32_t j = 0; j < dlock_buffer_after_resize[i].path.size(); j++) {
|
|
|
|
ASSERT_EQ(dlock_buffer_after_resize[i].path[j].m_txn_id,
|
|
|
|
dlock_buffer_before_resize[i].path[j].m_txn_id);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Upsize the buffer and verify the 3 latest dealocks are preserved.
|
|
|
|
dlock_buffer_before_resize = db->GetDeadlockInfoBuffer();
|
|
|
|
db->SetDeadlockInfoBufferSize(5);
|
|
|
|
dlock_buffer_after_resize = db->GetDeadlockInfoBuffer();
|
|
|
|
ASSERT_EQ(dlock_buffer_after_resize.size(), 3);
|
|
|
|
|
|
|
|
for (uint32_t i = 0; i < dlock_buffer_before_resize.size(); i++) {
|
|
|
|
for (uint32_t j = 0; j < dlock_buffer_before_resize[i].path.size(); j++) {
|
|
|
|
ASSERT_EQ(dlock_buffer_after_resize[i].path[j].m_txn_id,
|
|
|
|
dlock_buffer_before_resize[i].path[j].m_txn_id);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Downsize to 0 and verify the size is consistent.
|
|
|
|
dlock_buffer_before_resize = db->GetDeadlockInfoBuffer();
|
|
|
|
db->SetDeadlockInfoBufferSize(0);
|
|
|
|
dlock_buffer_after_resize = db->GetDeadlockInfoBuffer();
|
|
|
|
ASSERT_EQ(dlock_buffer_after_resize.size(), 0);
|
|
|
|
|
|
|
|
// Upsize from 0 to verify the size is persistent.
|
|
|
|
dlock_buffer_before_resize = db->GetDeadlockInfoBuffer();
|
|
|
|
db->SetDeadlockInfoBufferSize(3);
|
|
|
|
dlock_buffer_after_resize = db->GetDeadlockInfoBuffer();
|
|
|
|
ASSERT_EQ(dlock_buffer_after_resize.size(), 0);
|
|
|
|
|
|
|
|
// Contrived case of shared lock of cycle size 2 to verify that a shared
|
|
|
|
// lock causing a deadlock is correctly reported as "shared" in the buffer.
|
|
|
|
std::vector<Transaction*> txns_shared(2);
|
|
|
|
|
|
|
|
// Create a cycle of size 2.
|
|
|
|
for (uint32_t i = 0; i < 2; i++) {
|
|
|
|
txns_shared[i] = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txns_shared[i]);
|
|
|
|
auto s =
|
|
|
|
txns_shared[i]->GetForUpdate(read_options, std::to_string(i), nullptr);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::atomic<uint32_t> checkpoints_shared(0);
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"PointLockManager::AcquireWithTimeout:WaitingTxn",
|
|
|
|
[&](void* /*arg*/) { checkpoints_shared.fetch_add(1); });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
std::vector<port::Thread> threads_shared;
|
|
|
|
for (uint32_t i = 0; i < 1; i++) {
|
|
|
|
std::function<void()> blocking_thread = [&, i] {
|
|
|
|
auto s = txns_shared[i]->GetForUpdate(read_options, std::to_string(i + 1),
|
|
|
|
nullptr);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_OK(txns_shared[i]->Rollback());
|
|
|
|
delete txns_shared[i];
|
|
|
|
};
|
|
|
|
threads_shared.emplace_back(blocking_thread);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Wait until all threads are waiting on each other.
|
|
|
|
while (checkpoints_shared.load() != 1) {
|
|
|
|
/* sleep override */
|
|
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
|
|
|
}
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
|
|
|
|
// Complete the cycle T2 -> T1 with a shared lock.
|
|
|
|
auto s = txns_shared[1]->GetForUpdate(read_options, "0", nullptr, false);
|
|
|
|
ASSERT_TRUE(s.IsDeadlock());
|
|
|
|
|
|
|
|
auto dlock_buffer = db->GetDeadlockInfoBuffer();
|
|
|
|
|
|
|
|
// Verify the size of the buffer and the single path.
|
|
|
|
ASSERT_EQ(dlock_buffer.size(), 1);
|
|
|
|
ASSERT_EQ(dlock_buffer[0].path.size(), 2);
|
|
|
|
|
|
|
|
// Verify the exclusivity field of the transactions in the deadlock path.
|
|
|
|
ASSERT_TRUE(dlock_buffer[0].path[0].m_exclusive);
|
|
|
|
ASSERT_FALSE(dlock_buffer[0].path[1].m_exclusive);
|
|
|
|
ASSERT_OK(txns_shared[1]->Rollback());
|
|
|
|
delete txns_shared[1];
|
|
|
|
|
|
|
|
for (auto& t : threads_shared) {
|
|
|
|
t.join();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
|
|
|
TEST_P(TransactionStressTest, DeadlockCycle) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
|
|
|
|
// offset by 2 from the max depth to test edge case
|
|
|
|
const uint32_t kMaxCycleLength = 52;
|
|
|
|
|
|
|
|
txn_options.lock_timeout = 1000000;
|
|
|
|
txn_options.deadlock_detect = true;
|
|
|
|
|
|
|
|
for (uint32_t len = 2; len < kMaxCycleLength; len++) {
|
|
|
|
// Set up a long wait for chain like this:
|
|
|
|
//
|
|
|
|
// T1 -> T2 -> T3 -> ... -> Tlen
|
|
|
|
|
|
|
|
std::vector<Transaction*> txns(len);
|
|
|
|
|
|
|
|
for (uint32_t i = 0; i < len; i++) {
|
|
|
|
txns[i] = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txns[i]);
|
|
|
|
auto s = txns[i]->GetForUpdate(read_options, std::to_string(i), nullptr);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::atomic<uint32_t> checkpoints(0);
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"PointLockManager::AcquireWithTimeout:WaitingTxn",
|
|
|
|
[&](void* /*arg*/) { checkpoints.fetch_add(1); });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// We want the last transaction in the chain to block and hold everyone
|
|
|
|
// back.
|
|
|
|
std::vector<port::Thread> threads;
|
|
|
|
for (uint32_t i = 0; i + 1 < len; i++) {
|
|
|
|
std::function<void()> blocking_thread = [&, i] {
|
|
|
|
auto s =
|
|
|
|
txns[i]->GetForUpdate(read_options, std::to_string(i + 1), nullptr);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_OK(txns[i]->Rollback());
|
|
|
|
delete txns[i];
|
|
|
|
};
|
|
|
|
threads.emplace_back(blocking_thread);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Wait until all threads are waiting on each other.
|
|
|
|
while (checkpoints.load() != len - 1) {
|
|
|
|
/* sleep override */
|
|
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
|
|
|
}
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
|
|
|
|
// Complete the cycle Tlen -> T1
|
|
|
|
auto s = txns[len - 1]->GetForUpdate(read_options, "0", nullptr);
|
|
|
|
ASSERT_TRUE(s.IsDeadlock());
|
|
|
|
|
|
|
|
const uint32_t dlock_buffer_size_ = (len - 1 > 5) ? 5 : (len - 1);
|
|
|
|
uint32_t curr_waiting_key = 0;
|
|
|
|
TransactionID curr_txn_id = txns[0]->GetID();
|
|
|
|
|
|
|
|
auto dlock_buffer = db->GetDeadlockInfoBuffer();
|
|
|
|
ASSERT_EQ(dlock_buffer.size(), dlock_buffer_size_);
|
|
|
|
uint32_t check_len = len;
|
|
|
|
bool check_limit_flag = false;
|
|
|
|
|
|
|
|
// Special case for a deadlock path that exceeds the maximum depth.
|
|
|
|
if (len > 50) {
|
|
|
|
check_len = 0;
|
|
|
|
check_limit_flag = true;
|
|
|
|
}
|
|
|
|
auto dlock_entry = dlock_buffer[0].path;
|
|
|
|
ASSERT_EQ(dlock_entry.size(), check_len);
|
|
|
|
ASSERT_EQ(dlock_buffer[0].limit_exceeded, check_limit_flag);
|
|
|
|
|
|
|
|
int64_t pre_deadlock_time = dlock_buffer[0].deadlock_time;
|
|
|
|
int64_t cur_deadlock_time = 0;
|
|
|
|
for (auto const& dl_path_rec : dlock_buffer) {
|
|
|
|
cur_deadlock_time = dl_path_rec.deadlock_time;
|
|
|
|
ASSERT_NE(cur_deadlock_time, 0);
|
|
|
|
ASSERT_TRUE(cur_deadlock_time <= pre_deadlock_time);
|
|
|
|
pre_deadlock_time = cur_deadlock_time;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Iterates backwards over path verifying decreasing txn_ids.
|
|
|
|
for (auto it = dlock_entry.rbegin(); it != dlock_entry.rend(); ++it) {
|
|
|
|
auto dl_node = *it;
|
|
|
|
ASSERT_EQ(dl_node.m_txn_id, len + curr_txn_id - 1);
|
|
|
|
ASSERT_EQ(dl_node.m_cf_id, 0u);
|
|
|
|
ASSERT_EQ(dl_node.m_waiting_key, std::to_string(curr_waiting_key));
|
|
|
|
ASSERT_EQ(dl_node.m_exclusive, true);
|
|
|
|
|
|
|
|
curr_txn_id--;
|
|
|
|
if (curr_waiting_key == 0) {
|
|
|
|
curr_waiting_key = len;
|
|
|
|
}
|
|
|
|
curr_waiting_key--;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Rollback the last transaction.
|
|
|
|
ASSERT_OK(txns[len - 1]->Rollback());
|
|
|
|
delete txns[len - 1];
|
|
|
|
|
|
|
|
for (auto& t : threads) {
|
|
|
|
t.join();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionStressTest, DeadlockStress) {
|
|
|
|
const uint32_t NUM_TXN_THREADS = 10;
|
|
|
|
const uint32_t NUM_KEYS = 100;
|
|
|
|
const uint32_t NUM_ITERS = 1000;
|
|
|
|
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
|
|
|
|
txn_options.lock_timeout = 1000000;
|
|
|
|
txn_options.deadlock_detect = true;
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
|
|
|
|
for (uint32_t i = 0; i < NUM_KEYS; i++) {
|
|
|
|
ASSERT_OK(db->Put(write_options, Slice(std::to_string(i)), Slice("")));
|
|
|
|
keys.push_back(std::to_string(i));
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t tid = std::hash<std::thread::id>()(std::this_thread::get_id());
|
|
|
|
Random rnd(static_cast<uint32_t>(tid));
|
|
|
|
std::function<void(uint32_t)> stress_thread = [&](uint32_t seed) {
|
|
|
|
std::default_random_engine g(seed);
|
|
|
|
|
|
|
|
Transaction* txn;
|
|
|
|
for (uint32_t i = 0; i < NUM_ITERS; i++) {
|
|
|
|
txn = db->BeginTransaction(write_options, txn_options);
|
|
|
|
auto random_keys = keys;
|
|
|
|
std::shuffle(random_keys.begin(), random_keys.end(), g);
|
|
|
|
|
|
|
|
// Lock keys in random order.
|
|
|
|
for (const auto& k : random_keys) {
|
|
|
|
// Lock mostly for shared access, but exclusive 1/4 of the time.
|
|
|
|
auto s =
|
|
|
|
txn->GetForUpdate(read_options, k, nullptr, txn->GetID() % 4 == 0);
|
|
|
|
if (!s.ok()) {
|
|
|
|
ASSERT_TRUE(s.IsDeadlock());
|
|
|
|
ASSERT_OK(txn->Rollback());
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
std::vector<port::Thread> threads;
|
|
|
|
for (uint32_t i = 0; i < NUM_TXN_THREADS; i++) {
|
|
|
|
threads.emplace_back(stress_thread, rnd.Next());
|
|
|
|
}
|
|
|
|
|
|
|
|
for (auto& t : threads) {
|
|
|
|
t.join();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, CommitTimeBatchFailTest) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
|
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn1);
|
|
|
|
|
|
|
|
ASSERT_OK(txn1->GetCommitTimeWriteBatch()->Put("cat", "dog"));
|
|
|
|
|
|
|
|
s = txn1->Put("foo", "bar");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// fails due to non-empty commit-time batch
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_EQ(s, Status::InvalidArgument());
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, LogMarkLeakTest) {
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
WriteOptions write_options;
|
|
|
|
options.write_buffer_size = 1024;
|
|
|
|
ASSERT_OK(ReOpenNoDelete());
|
|
|
|
assert(db != nullptr);
|
|
|
|
Random rnd(47);
|
|
|
|
std::vector<Transaction*> txns;
|
|
|
|
DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
|
|
|
|
// At the beginning there should be no log containing prepare data
|
|
|
|
ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
|
|
|
|
for (size_t i = 0; i < 100; i++) {
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_OK(txn->SetName("xid" + std::to_string(i)));
|
|
|
|
ASSERT_OK(txn->Put(Slice("foo" + std::to_string(i)), Slice("bar")));
|
|
|
|
ASSERT_OK(txn->Prepare());
|
|
|
|
ASSERT_GT(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
|
|
|
|
if (rnd.OneIn(5)) {
|
|
|
|
txns.push_back(txn);
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(txn->Commit());
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
ASSERT_OK(db_impl->TEST_FlushMemTable(true));
|
|
|
|
}
|
|
|
|
for (auto txn : txns) {
|
|
|
|
ASSERT_OK(txn->Commit());
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
// At the end there should be no log left containing prepare data
|
|
|
|
ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
|
|
|
|
// Make sure that the underlying data structures are properly truncated and
|
|
|
|
// cause not leak
|
|
|
|
ASSERT_EQ(db_impl->TEST_PreparedSectionCompletedSize(), 0);
|
|
|
|
ASSERT_EQ(db_impl->TEST_LogsWithPrepSize(), 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, SimpleTwoPhaseTransactionTest) {
|
|
|
|
for (bool cwb4recovery : {true, false}) {
|
|
|
|
ASSERT_OK(ReOpen());
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
txn_options.use_only_the_last_commit_time_batch_for_recovery = cwb4recovery;
|
|
|
|
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
3 years ago
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
|
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options, txn_options);
|
|
|
|
s = txn->SetName("xid");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_EQ(db->GetTransactionByName("xid"), txn);
|
|
|
|
|
|
|
|
// transaction put
|
|
|
|
s = txn->Put(Slice("foo"), Slice("bar"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(1, txn->GetNumPuts());
|
|
|
|
|
|
|
|
// regular db put
|
|
|
|
s = db->Put(write_options, Slice("foo2"), Slice("bar2"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(1, txn->GetNumPuts());
|
|
|
|
|
|
|
|
// regular db read
|
|
|
|
ASSERT_OK(db->Get(read_options, "foo2", &value));
|
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
|
|
|
|
// commit time put
|
|
|
|
if (cwb4recovery) {
|
|
|
|
ASSERT_OK(
|
|
|
|
txn->GetCommitTimeWriteBatch()->Put(Slice("gtid"), Slice("dogs")));
|
|
|
|
ASSERT_OK(
|
|
|
|
txn->GetCommitTimeWriteBatch()->Put(Slice("gtid2"), Slice("cats")));
|
|
|
|
}
|
|
|
|
|
|
|
|
// nothing has been prepped yet
|
|
|
|
ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
|
|
|
|
|
|
|
|
s = txn->Prepare();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// data not im mem yet
|
|
|
|
s = db->Get(read_options, Slice("foo"), &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
s = db->Get(read_options, Slice("gtid"), &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
// find trans in list of prepared transactions
|
|
|
|
std::vector<Transaction*> prepared_trans;
|
|
|
|
db->GetAllPreparedTransactions(&prepared_trans);
|
|
|
|
ASSERT_EQ(prepared_trans.size(), 1);
|
|
|
|
ASSERT_EQ(prepared_trans.front()->GetName(), "xid");
|
|
|
|
|
|
|
|
auto log_containing_prep =
|
|
|
|
db_impl->TEST_FindMinLogContainingOutstandingPrep();
|
|
|
|
ASSERT_GT(log_containing_prep, 0);
|
|
|
|
|
|
|
|
// make commit
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// value is now available
|
|
|
|
s = db->Get(read_options, "foo", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
|
|
|
|
// we already committed
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_EQ(s, Status::InvalidArgument());
|
|
|
|
|
|
|
|
// no longer is prepared results
|
|
|
|
db->GetAllPreparedTransactions(&prepared_trans);
|
|
|
|
ASSERT_EQ(prepared_trans.size(), 0);
|
|
|
|
ASSERT_EQ(db->GetTransactionByName("xid"), nullptr);
|
|
|
|
|
|
|
|
// heap should not care about prepared section anymore
|
|
|
|
ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
|
|
|
|
|
|
|
|
switch (txn_db_options.write_policy) {
|
|
|
|
case WRITE_COMMITTED:
|
|
|
|
// but now our memtable should be referencing the prep section
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
ASSERT_GE(log_containing_prep, db_impl->MinLogNumberToKeep());
|
|
|
|
ASSERT_EQ(log_containing_prep,
|
|
|
|
db_impl->TEST_FindMinPrepLogReferencedByMemTable());
|
|
|
|
break;
|
|
|
|
case WRITE_PREPARED:
|
|
|
|
case WRITE_UNPREPARED:
|
|
|
|
// In these modes memtable do not ref the prep sections
|
|
|
|
ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
assert(false);
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_OK(db_impl->TEST_FlushMemTable(true));
|
|
|
|
// After flush the recoverable state must be visible
|
|
|
|
if (cwb4recovery) {
|
|
|
|
s = db->Get(read_options, "gtid", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "dogs");
|
|
|
|
|
|
|
|
s = db->Get(read_options, "gtid2", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "cats");
|
|
|
|
}
|
|
|
|
|
|
|
|
// after memtable flush we can now relese the log
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
ASSERT_GT(db_impl->MinLogNumberToKeep(), log_containing_prep);
|
|
|
|
ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
|
|
|
|
if (cwb4recovery) {
|
|
|
|
// kill and reopen to trigger recovery
|
|
|
|
s = ReOpenNoDelete();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
assert(db != nullptr);
|
|
|
|
s = db->Get(read_options, "gtid", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "dogs");
|
|
|
|
|
|
|
|
s = db->Get(read_options, "gtid2", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "cats");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, TwoPhaseNameTest) {
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
WriteOptions write_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
Transaction* txn3 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn3);
|
|
|
|
delete txn3;
|
|
|
|
|
|
|
|
// cant prepare txn without name
|
|
|
|
s = txn1->Prepare();
|
|
|
|
ASSERT_EQ(s, Status::InvalidArgument());
|
|
|
|
|
|
|
|
// name too short
|
|
|
|
s = txn1->SetName("");
|
|
|
|
ASSERT_EQ(s, Status::InvalidArgument());
|
|
|
|
|
|
|
|
// name too long
|
|
|
|
s = txn1->SetName(std::string(513, 'x'));
|
|
|
|
ASSERT_EQ(s, Status::InvalidArgument());
|
|
|
|
|
|
|
|
// valid set name
|
|
|
|
s = txn1->SetName("name1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// cant have duplicate name
|
|
|
|
s = txn2->SetName("name1");
|
|
|
|
ASSERT_EQ(s, Status::InvalidArgument());
|
|
|
|
|
|
|
|
// shouldn't be able to prepare
|
|
|
|
s = txn2->Prepare();
|
|
|
|
ASSERT_EQ(s, Status::InvalidArgument());
|
|
|
|
|
|
|
|
// valid name set
|
|
|
|
s = txn2->SetName("name2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// cant reset name
|
|
|
|
s = txn2->SetName("name3");
|
|
|
|
ASSERT_EQ(s, Status::InvalidArgument());
|
|
|
|
|
|
|
|
ASSERT_EQ(txn1->GetName(), "name1");
|
|
|
|
ASSERT_EQ(txn2->GetName(), "name2");
|
|
|
|
|
|
|
|
s = txn1->Prepare();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// can't rename after prepare
|
|
|
|
s = txn1->SetName("name4");
|
|
|
|
ASSERT_EQ(s, Status::InvalidArgument());
|
|
|
|
|
|
|
|
ASSERT_OK(txn1->Rollback());
|
|
|
|
ASSERT_OK(txn2->Rollback());
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, TwoPhaseEmptyWriteTest) {
|
|
|
|
for (bool cwb4recovery : {true, false}) {
|
|
|
|
for (bool test_with_empty_wal : {true, false}) {
|
|
|
|
if (!cwb4recovery && test_with_empty_wal) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
ASSERT_OK(ReOpen());
|
|
|
|
Status s;
|
|
|
|
std::string value;
|
|
|
|
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
txn_options.use_only_the_last_commit_time_batch_for_recovery =
|
|
|
|
cwb4recovery;
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn1);
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn2);
|
|
|
|
|
|
|
|
s = txn1->SetName("joe");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->SetName("bob");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->Prepare();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
|
|
|
|
if (cwb4recovery) {
|
|
|
|
ASSERT_OK(
|
|
|
|
txn2->GetCommitTimeWriteBatch()->Put(Slice("foo"), Slice("bar")));
|
|
|
|
}
|
|
|
|
|
|
|
|
s = txn2->Prepare();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete txn2;
|
|
|
|
if (cwb4recovery) {
|
|
|
|
if (test_with_empty_wal) {
|
|
|
|
DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
|
|
|
|
ASSERT_OK(db_impl->TEST_FlushMemTable(true));
|
|
|
|
// After flush the state must be visible
|
|
|
|
s = db->Get(read_options, "foo", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
}
|
|
|
|
ASSERT_OK(db->FlushWAL(true));
|
|
|
|
// kill and reopen to trigger recovery
|
|
|
|
s = ReOpenNoDelete();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
assert(db != nullptr);
|
|
|
|
s = db->Get(read_options, "foo", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
|
|
|
TEST_P(TransactionStressTest, TwoPhaseExpirationTest) {
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
WriteOptions write_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
txn_options.expiration = 500; // 500ms
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn1);
|
|
|
|
ASSERT_TRUE(txn1);
|
|
|
|
|
|
|
|
s = txn1->SetName("joe");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->SetName("bob");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->Prepare();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
/* sleep override */
|
|
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
|
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->Prepare();
|
|
|
|
ASSERT_EQ(s, Status::Expired());
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, TwoPhaseRollbackTest) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
|
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options, txn_options);
|
|
|
|
s = txn->SetName("xid");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// transaction put
|
|
|
|
s = txn->Put(Slice("tfoo"), Slice("tbar"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// value is readable form txn
|
|
|
|
s = txn->Get(read_options, Slice("tfoo"), &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "tbar");
|
|
|
|
|
|
|
|
// issue rollback
|
|
|
|
s = txn->Rollback();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// value is nolonger readable
|
|
|
|
s = txn->Get(read_options, Slice("tfoo"), &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
ASSERT_EQ(txn->GetNumPuts(), 0);
|
|
|
|
|
|
|
|
// put new txn values
|
|
|
|
s = txn->Put(Slice("tfoo2"), Slice("tbar2"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// new value is readable from txn
|
|
|
|
s = txn->Get(read_options, Slice("tfoo2"), &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "tbar2");
|
|
|
|
|
|
|
|
s = txn->Prepare();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// flush to next wal
|
|
|
|
s = db->Put(write_options, Slice("foo"), Slice("bar"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_OK(db_impl->TEST_FlushMemTable(true));
|
|
|
|
|
|
|
|
// issue rollback (marker written to WAL)
|
|
|
|
s = txn->Rollback();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// value is nolonger readable
|
|
|
|
s = txn->Get(read_options, Slice("tfoo2"), &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
ASSERT_EQ(txn->GetNumPuts(), 0);
|
|
|
|
|
|
|
|
// make commit
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_EQ(s, Status::InvalidArgument());
|
|
|
|
|
|
|
|
// try rollback again
|
|
|
|
s = txn->Rollback();
|
|
|
|
ASSERT_EQ(s, Status::InvalidArgument());
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, PersistentTwoPhaseTransactionTest) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
write_options.sync = true;
|
|
|
|
write_options.disableWAL = false;
|
|
|
|
ReadOptions read_options;
|
|
|
|
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
|
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
|
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options, txn_options);
|
|
|
|
s = txn->SetName("xid");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_EQ(db->GetTransactionByName("xid"), txn);
|
|
|
|
|
|
|
|
// transaction put
|
|
|
|
s = txn->Put(Slice("foo"), Slice("bar"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(1, txn->GetNumPuts());
|
|
|
|
|
|
|
|
// txn read
|
|
|
|
s = txn->Get(read_options, "foo", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
|
|
|
|
// regular db put
|
|
|
|
s = db->Put(write_options, Slice("foo2"), Slice("bar2"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(1, txn->GetNumPuts());
|
|
|
|
|
|
|
|
ASSERT_OK(db_impl->TEST_FlushMemTable(true));
|
|
|
|
|
|
|
|
// regular db read
|
|
|
|
db->Get(read_options, "foo2", &value);
|
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
|
|
|
|
// nothing has been prepped yet
|
|
|
|
ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
|
|
|
|
|
|
|
|
// prepare
|
|
|
|
s = txn->Prepare();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// still not available to db
|
|
|
|
s = db->Get(read_options, Slice("foo"), &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
ASSERT_OK(db->FlushWAL(false));
|
|
|
|
delete txn;
|
|
|
|
// kill and reopen
|
|
|
|
reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
|
|
|
|
s = ReOpenNoDelete();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
assert(db != nullptr);
|
|
|
|
db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
|
|
|
|
|
|
|
|
// find trans in list of prepared transactions
|
|
|
|
std::vector<Transaction*> prepared_trans;
|
|
|
|
db->GetAllPreparedTransactions(&prepared_trans);
|
|
|
|
ASSERT_EQ(prepared_trans.size(), 1);
|
|
|
|
|
|
|
|
txn = prepared_trans.front();
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
ASSERT_EQ(txn->GetName(), "xid");
|
|
|
|
ASSERT_EQ(db->GetTransactionByName("xid"), txn);
|
|
|
|
|
|
|
|
// log has been marked
|
|
|
|
auto log_containing_prep =
|
|
|
|
db_impl->TEST_FindMinLogContainingOutstandingPrep();
|
|
|
|
ASSERT_GT(log_containing_prep, 0);
|
|
|
|
|
|
|
|
// value is readable from txn
|
|
|
|
s = txn->Get(read_options, "foo", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
|
|
|
|
// make commit
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// value is now available
|
|
|
|
db->Get(read_options, "foo", &value);
|
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
|
|
|
|
// we already committed
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_EQ(s, Status::InvalidArgument());
|
|
|
|
|
|
|
|
// no longer is prepared results
|
|
|
|
prepared_trans.clear();
|
|
|
|
db->GetAllPreparedTransactions(&prepared_trans);
|
|
|
|
ASSERT_EQ(prepared_trans.size(), 0);
|
|
|
|
|
|
|
|
// transaction should no longer be visible
|
|
|
|
ASSERT_EQ(db->GetTransactionByName("xid"), nullptr);
|
|
|
|
|
|
|
|
// heap should not care about prepared section anymore
|
|
|
|
ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
|
|
|
|
|
|
|
|
switch (txn_db_options.write_policy) {
|
|
|
|
case WRITE_COMMITTED:
|
|
|
|
// but now our memtable should be referencing the prep section
|
|
|
|
ASSERT_EQ(log_containing_prep,
|
|
|
|
db_impl->TEST_FindMinPrepLogReferencedByMemTable());
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
ASSERT_GE(log_containing_prep, db_impl->MinLogNumberToKeep());
|
|
|
|
|
|
|
|
break;
|
|
|
|
case WRITE_PREPARED:
|
|
|
|
case WRITE_UNPREPARED:
|
|
|
|
// In these modes memtable do not ref the prep sections
|
|
|
|
ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
assert(false);
|
|
|
|
}
|
|
|
|
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
// Add a dummy record to memtable before a flush. Otherwise, the
|
|
|
|
// memtable will be empty and flush will be skipped.
|
|
|
|
s = db->Put(write_options, Slice("foo3"), Slice("bar3"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_OK(db_impl->TEST_FlushMemTable(true));
|
|
|
|
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
// after memtable flush we can now release the log
|
|
|
|
ASSERT_GT(db_impl->MinLogNumberToKeep(), log_containing_prep);
|
|
|
|
ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
|
|
|
|
// deleting transaction should unregister transaction
|
|
|
|
ASSERT_EQ(db->GetTransactionByName("xid"), nullptr);
|
|
|
|
}
|
|
|
|
#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
|
|
|
|
|
|
|
// TODO this test needs to be updated with serial commits
|
|
|
|
TEST_P(TransactionTest, DISABLED_TwoPhaseMultiThreadTest) {
|
|
|
|
// mix transaction writes and regular writes
|
|
|
|
const uint32_t NUM_TXN_THREADS = 50;
|
|
|
|
std::atomic<uint32_t> txn_thread_num(0);
|
|
|
|
|
|
|
|
std::function<void()> txn_write_thread = [&]() {
|
|
|
|
uint32_t id = txn_thread_num.fetch_add(1);
|
|
|
|
|
|
|
|
WriteOptions write_options;
|
|
|
|
write_options.sync = true;
|
|
|
|
write_options.disableWAL = false;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
txn_options.lock_timeout = 1000000;
|
|
|
|
if (id % 2 == 0) {
|
|
|
|
txn_options.expiration = 1000000;
|
|
|
|
}
|
|
|
|
TransactionName name("xid_" + std::string(1, 'A' + static_cast<char>(id)));
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_OK(txn->SetName(name));
|
|
|
|
for (int i = 0; i < 10; i++) {
|
|
|
|
std::string key(name + "_" + std::string(1, static_cast<char>('A' + i)));
|
|
|
|
ASSERT_OK(txn->Put(key, "val"));
|
|
|
|
}
|
|
|
|
ASSERT_OK(txn->Prepare());
|
|
|
|
ASSERT_OK(txn->Commit());
|
|
|
|
delete txn;
|
|
|
|
};
|
|
|
|
|
|
|
|
// assure that all thread are in the same write group
|
|
|
|
std::atomic<uint32_t> t_wait_on_prepare(0);
|
|
|
|
std::atomic<uint32_t> t_wait_on_commit(0);
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
|
|
|
|
auto* writer = reinterpret_cast<WriteThread::Writer*>(arg);
|
|
|
|
|
|
|
|
if (writer->ShouldWriteToWAL()) {
|
|
|
|
t_wait_on_prepare.fetch_add(1);
|
|
|
|
// wait for friends
|
|
|
|
while (t_wait_on_prepare.load() < NUM_TXN_THREADS) {
|
|
|
|
env->SleepForMicroseconds(10);
|
|
|
|
}
|
|
|
|
} else if (writer->ShouldWriteToMemtable()) {
|
|
|
|
t_wait_on_commit.fetch_add(1);
|
|
|
|
// wait for friends
|
|
|
|
while (t_wait_on_commit.load() < NUM_TXN_THREADS) {
|
|
|
|
env->SleepForMicroseconds(10);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
FAIL();
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// do all the writes
|
|
|
|
std::vector<port::Thread> threads;
|
|
|
|
for (uint32_t i = 0; i < NUM_TXN_THREADS; i++) {
|
|
|
|
threads.emplace_back(txn_write_thread);
|
|
|
|
}
|
|
|
|
for (auto& t : threads) {
|
|
|
|
t.join();
|
|
|
|
}
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
|
|
|
|
ReadOptions read_options;
|
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
for (uint32_t t = 0; t < NUM_TXN_THREADS; t++) {
|
|
|
|
TransactionName name("xid_" + std::string(1, 'A' + static_cast<char>(t)));
|
|
|
|
for (int i = 0; i < 10; i++) {
|
|
|
|
std::string key(name + "_" + std::string(1, static_cast<char>('A' + i)));
|
|
|
|
s = db->Get(read_options, key, &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "val");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
|
|
|
TEST_P(TransactionStressTest, TwoPhaseLongPrepareTest) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
write_options.sync = true;
|
|
|
|
write_options.disableWAL = false;
|
|
|
|
ReadOptions read_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
|
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options, txn_options);
|
|
|
|
s = txn->SetName("bob");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// transaction put
|
|
|
|
s = txn->Put(Slice("foo"), Slice("bar"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// prepare
|
|
|
|
s = txn->Prepare();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
|
|
|
|
for (int i = 0; i < 1000; i++) {
|
|
|
|
std::string key(i, 'k');
|
|
|
|
std::string val(1000, 'v');
|
|
|
|
assert(db != nullptr);
|
|
|
|
s = db->Put(write_options, key, val);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
if (i % 29 == 0) {
|
|
|
|
// crash
|
|
|
|
env->SetFilesystemActive(false);
|
|
|
|
reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
|
|
|
|
ReOpenNoDelete();
|
|
|
|
} else if (i % 37 == 0) {
|
|
|
|
// close
|
|
|
|
ReOpenNoDelete();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// commit old txn
|
|
|
|
txn = db->GetTransactionByName("bob");
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// verify data txn data
|
|
|
|
s = db->Get(read_options, "foo", &value);
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
|
|
|
|
// verify non txn data
|
|
|
|
for (int i = 0; i < 1000; i++) {
|
|
|
|
std::string key(i, 'k');
|
|
|
|
std::string val(1000, 'v');
|
|
|
|
s = db->Get(read_options, key, &value);
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
ASSERT_EQ(value, val);
|
|
|
|
}
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, TwoPhaseSequenceTest) {
|
[rocksdb] Recovery path sequence miscount fix
Summary:
Consider the following WAL with 4 batch entries prefixed with their sequence at time of memtable insert.
[1: BEGIN_PREPARE, PUT, PUT, PUT, PUT, END_PREPARE(a)]
[1: BEGIN_PREPARE, PUT, PUT, PUT, PUT, END_PREPARE(b)]
[4: COMMIT(a)]
[7: COMMIT(b)]
The first two batches do not consume any sequence numbers so are both prefixed with seq=1.
For 2pc commit, memtable insertion takes place before COMMIT batch is written to WAL.
We can see that sequence number consumption takes place between WAL entries giving us the seemingly sparse sequence prefix for WAL entries.
This is a valid WAL.
Because with 2PC markers one WriteBatch points to another batch containing its inserts a writebatch can consume more or less sequence numbers than the number of sequence consuming entries that it contains.
We can see that, given the entries in the WAL, 6 sequence ids were consumed. Yet on recovery the maximum sequence consumed would be 7 + 3 (the number of sequence numbers consumed by COMMIT(b))
So, now upon recovery we must track the actual consumption of sequence numbers.
In the provided scenario there will be no sequence gaps, but it is possible to produce a sequence gap. This should not be a problem though. correct?
Test Plan: provided test.
Reviewers: sdong
Subscribers: andrewkr, leveldb, dhruba, hermanlee4
Differential Revision: https://reviews.facebook.net/D57645
9 years ago
|
|
|
WriteOptions write_options;
|
|
|
|
write_options.sync = true;
|
|
|
|
write_options.disableWAL = false;
|
|
|
|
ReadOptions read_options;
|
|
|
|
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
|
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options, txn_options);
|
|
|
|
s = txn->SetName("xid");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// transaction put
|
|
|
|
s = txn->Put(Slice("foo"), Slice("bar"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn->Put(Slice("foo2"), Slice("bar2"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn->Put(Slice("foo3"), Slice("bar3"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn->Put(Slice("foo4"), Slice("bar4"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// prepare
|
|
|
|
s = txn->Prepare();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// make commit
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
|
|
|
|
// kill and reopen
|
|
|
|
env->SetFilesystemActive(false);
|
[rocksdb] Recovery path sequence miscount fix
Summary:
Consider the following WAL with 4 batch entries prefixed with their sequence at time of memtable insert.
[1: BEGIN_PREPARE, PUT, PUT, PUT, PUT, END_PREPARE(a)]
[1: BEGIN_PREPARE, PUT, PUT, PUT, PUT, END_PREPARE(b)]
[4: COMMIT(a)]
[7: COMMIT(b)]
The first two batches do not consume any sequence numbers so are both prefixed with seq=1.
For 2pc commit, memtable insertion takes place before COMMIT batch is written to WAL.
We can see that sequence number consumption takes place between WAL entries giving us the seemingly sparse sequence prefix for WAL entries.
This is a valid WAL.
Because with 2PC markers one WriteBatch points to another batch containing its inserts a writebatch can consume more or less sequence numbers than the number of sequence consuming entries that it contains.
We can see that, given the entries in the WAL, 6 sequence ids were consumed. Yet on recovery the maximum sequence consumed would be 7 + 3 (the number of sequence numbers consumed by COMMIT(b))
So, now upon recovery we must track the actual consumption of sequence numbers.
In the provided scenario there will be no sequence gaps, but it is possible to produce a sequence gap. This should not be a problem though. correct?
Test Plan: provided test.
Reviewers: sdong
Subscribers: andrewkr, leveldb, dhruba, hermanlee4
Differential Revision: https://reviews.facebook.net/D57645
9 years ago
|
|
|
ReOpenNoDelete();
|
|
|
|
assert(db != nullptr);
|
[rocksdb] Recovery path sequence miscount fix
Summary:
Consider the following WAL with 4 batch entries prefixed with their sequence at time of memtable insert.
[1: BEGIN_PREPARE, PUT, PUT, PUT, PUT, END_PREPARE(a)]
[1: BEGIN_PREPARE, PUT, PUT, PUT, PUT, END_PREPARE(b)]
[4: COMMIT(a)]
[7: COMMIT(b)]
The first two batches do not consume any sequence numbers so are both prefixed with seq=1.
For 2pc commit, memtable insertion takes place before COMMIT batch is written to WAL.
We can see that sequence number consumption takes place between WAL entries giving us the seemingly sparse sequence prefix for WAL entries.
This is a valid WAL.
Because with 2PC markers one WriteBatch points to another batch containing its inserts a writebatch can consume more or less sequence numbers than the number of sequence consuming entries that it contains.
We can see that, given the entries in the WAL, 6 sequence ids were consumed. Yet on recovery the maximum sequence consumed would be 7 + 3 (the number of sequence numbers consumed by COMMIT(b))
So, now upon recovery we must track the actual consumption of sequence numbers.
In the provided scenario there will be no sequence gaps, but it is possible to produce a sequence gap. This should not be a problem though. correct?
Test Plan: provided test.
Reviewers: sdong
Subscribers: andrewkr, leveldb, dhruba, hermanlee4
Differential Revision: https://reviews.facebook.net/D57645
9 years ago
|
|
|
|
|
|
|
// value is now available
|
|
|
|
s = db->Get(read_options, "foo4", &value);
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
ASSERT_EQ(value, "bar4");
|
|
|
|
}
|
|
|
|
#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
[rocksdb] Recovery path sequence miscount fix
Summary:
Consider the following WAL with 4 batch entries prefixed with their sequence at time of memtable insert.
[1: BEGIN_PREPARE, PUT, PUT, PUT, PUT, END_PREPARE(a)]
[1: BEGIN_PREPARE, PUT, PUT, PUT, PUT, END_PREPARE(b)]
[4: COMMIT(a)]
[7: COMMIT(b)]
The first two batches do not consume any sequence numbers so are both prefixed with seq=1.
For 2pc commit, memtable insertion takes place before COMMIT batch is written to WAL.
We can see that sequence number consumption takes place between WAL entries giving us the seemingly sparse sequence prefix for WAL entries.
This is a valid WAL.
Because with 2PC markers one WriteBatch points to another batch containing its inserts a writebatch can consume more or less sequence numbers than the number of sequence consuming entries that it contains.
We can see that, given the entries in the WAL, 6 sequence ids were consumed. Yet on recovery the maximum sequence consumed would be 7 + 3 (the number of sequence numbers consumed by COMMIT(b))
So, now upon recovery we must track the actual consumption of sequence numbers.
In the provided scenario there will be no sequence gaps, but it is possible to produce a sequence gap. This should not be a problem though. correct?
Test Plan: provided test.
Reviewers: sdong
Subscribers: andrewkr, leveldb, dhruba, hermanlee4
Differential Revision: https://reviews.facebook.net/D57645
9 years ago
|
|
|
|
|
|
|
TEST_P(TransactionTest, TwoPhaseDoubleRecoveryTest) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
write_options.sync = true;
|
|
|
|
write_options.disableWAL = false;
|
|
|
|
ReadOptions read_options;
|
|
|
|
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
|
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options, txn_options);
|
|
|
|
s = txn->SetName("a");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// transaction put
|
|
|
|
s = txn->Put(Slice("foo"), Slice("bar"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// prepare
|
|
|
|
s = txn->Prepare();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
|
|
|
|
// kill and reopen
|
|
|
|
env->SetFilesystemActive(false);
|
|
|
|
reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
|
|
|
|
ReOpenNoDelete();
|
|
|
|
|
|
|
|
// commit old txn
|
|
|
|
assert(db != nullptr); // Make clang analyze happy.
|
|
|
|
txn = db->GetTransactionByName("a");
|
|
|
|
assert(txn != nullptr);
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "foo", &value);
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
|
|
|
|
txn = db->BeginTransaction(write_options, txn_options);
|
|
|
|
s = txn->SetName("b");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Put(Slice("foo2"), Slice("bar2"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Prepare();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
|
|
|
|
// kill and reopen
|
|
|
|
env->SetFilesystemActive(false);
|
|
|
|
ASSERT_OK(ReOpenNoDelete());
|
|
|
|
assert(db != nullptr);
|
|
|
|
|
|
|
|
// value is now available
|
|
|
|
s = db->Get(read_options, "foo", &value);
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
|
|
|
|
s = db->Get(read_options, "foo2", &value);
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, TwoPhaseLogRollingTest) {
|
|
|
|
DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
|
|
|
|
|
|
|
|
Status s;
|
|
|
|
std::string v;
|
|
|
|
ColumnFamilyHandle *cfa, *cfb;
|
|
|
|
|
|
|
|
// Create 2 new column families
|
|
|
|
ColumnFamilyOptions cf_options;
|
|
|
|
s = db->CreateColumnFamily(cf_options, "CFA", &cfa);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = db->CreateColumnFamily(cf_options, "CFB", &cfb);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
WriteOptions wopts;
|
|
|
|
wopts.disableWAL = false;
|
|
|
|
wopts.sync = true;
|
|
|
|
|
|
|
|
TransactionOptions topts1;
|
|
|
|
Transaction* txn1 = db->BeginTransaction(wopts, topts1);
|
|
|
|
s = txn1->SetName("xid1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
TransactionOptions topts2;
|
|
|
|
Transaction* txn2 = db->BeginTransaction(wopts, topts2);
|
|
|
|
s = txn2->SetName("xid2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// transaction put in two column families
|
|
|
|
s = txn1->Put(cfa, "ka1", "va1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// transaction put in two column families
|
|
|
|
s = txn2->Put(cfa, "ka2", "va2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put(cfb, "kb2", "vb2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// write prep section to wal
|
|
|
|
s = txn1->Prepare();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// our log should be in the heap
|
|
|
|
ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
|
|
|
|
txn1->GetLogNumber());
|
|
|
|
ASSERT_EQ(db_impl->TEST_LogfileNumber(), txn1->GetLastLogNumber());
|
|
|
|
|
|
|
|
// flush default cf to crate new log
|
|
|
|
s = db->Put(wopts, "foo", "bar");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = db_impl->TEST_FlushMemTable(true);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// make sure we are on a new log
|
|
|
|
ASSERT_GT(db_impl->TEST_LogfileNumber(), txn1->GetLastLogNumber());
|
|
|
|
|
|
|
|
// put txn2 prep section in this log
|
|
|
|
s = txn2->Prepare();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(db_impl->TEST_LogfileNumber(), txn2->GetLastLogNumber());
|
|
|
|
|
|
|
|
// heap should still see first log
|
|
|
|
ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
|
|
|
|
txn1->GetLogNumber());
|
|
|
|
|
|
|
|
// commit txn1
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// heap should now show txn2s log
|
|
|
|
ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
|
|
|
|
txn2->GetLogNumber());
|
|
|
|
|
|
|
|
switch (txn_db_options.write_policy) {
|
|
|
|
case WRITE_COMMITTED:
|
|
|
|
// we should see txn1s log refernced by the memtables
|
|
|
|
ASSERT_EQ(txn1->GetLogNumber(),
|
|
|
|
db_impl->TEST_FindMinPrepLogReferencedByMemTable());
|
|
|
|
break;
|
|
|
|
case WRITE_PREPARED:
|
|
|
|
case WRITE_UNPREPARED:
|
|
|
|
// In these modes memtable do not ref the prep sections
|
|
|
|
ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
assert(false);
|
|
|
|
}
|
|
|
|
|
|
|
|
// flush default cf to crate new log
|
|
|
|
s = db->Put(wopts, "foo", "bar2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = db_impl->TEST_FlushMemTable(true);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// make sure we are on a new log
|
|
|
|
ASSERT_GT(db_impl->TEST_LogfileNumber(), txn2->GetLastLogNumber());
|
|
|
|
|
|
|
|
// commit txn2
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// heap should not show any logs
|
|
|
|
ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
|
|
|
|
|
|
|
|
switch (txn_db_options.write_policy) {
|
|
|
|
case WRITE_COMMITTED:
|
|
|
|
// should show the first txn log
|
|
|
|
ASSERT_EQ(txn1->GetLogNumber(),
|
|
|
|
db_impl->TEST_FindMinPrepLogReferencedByMemTable());
|
|
|
|
break;
|
|
|
|
case WRITE_PREPARED:
|
|
|
|
case WRITE_UNPREPARED:
|
|
|
|
// In these modes memtable do not ref the prep sections
|
|
|
|
ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
assert(false);
|
|
|
|
}
|
|
|
|
|
|
|
|
// flush only cfa memtable
|
|
|
|
s = db_impl->TEST_FlushMemTable(true, false, cfa);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
switch (txn_db_options.write_policy) {
|
|
|
|
case WRITE_COMMITTED:
|
|
|
|
// should show the first txn log
|
|
|
|
ASSERT_EQ(txn2->GetLogNumber(),
|
|
|
|
db_impl->TEST_FindMinPrepLogReferencedByMemTable());
|
|
|
|
break;
|
|
|
|
case WRITE_PREPARED:
|
|
|
|
case WRITE_UNPREPARED:
|
|
|
|
// In these modes memtable do not ref the prep sections
|
|
|
|
ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
assert(false);
|
|
|
|
}
|
|
|
|
|
|
|
|
// flush only cfb memtable
|
|
|
|
s = db_impl->TEST_FlushMemTable(true, false, cfb);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// should show not dependency on logs
|
|
|
|
ASSERT_EQ(db_impl->TEST_FindMinPrepLogReferencedByMemTable(), 0);
|
|
|
|
ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
delete cfa;
|
|
|
|
delete cfb;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, TwoPhaseLogRollingTest2) {
|
|
|
|
DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
|
|
|
|
|
|
|
|
Status s;
|
|
|
|
ColumnFamilyHandle *cfa, *cfb;
|
|
|
|
|
|
|
|
ColumnFamilyOptions cf_options;
|
|
|
|
s = db->CreateColumnFamily(cf_options, "CFA", &cfa);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = db->CreateColumnFamily(cf_options, "CFB", &cfb);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
WriteOptions wopts;
|
|
|
|
wopts.disableWAL = false;
|
|
|
|
wopts.sync = true;
|
|
|
|
|
|
|
|
auto cfh_a = static_cast_with_check<ColumnFamilyHandleImpl>(cfa);
|
|
|
|
auto cfh_b = static_cast_with_check<ColumnFamilyHandleImpl>(cfb);
|
|
|
|
|
|
|
|
TransactionOptions topts1;
|
|
|
|
Transaction* txn1 = db->BeginTransaction(wopts, topts1);
|
|
|
|
s = txn1->SetName("xid1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn1->Put(cfa, "boys", "girls1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
Transaction* txn2 = db->BeginTransaction(wopts, topts1);
|
|
|
|
s = txn2->SetName("xid2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put(cfb, "up", "down1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// prepre transaction in LOG A
|
|
|
|
s = txn1->Prepare();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// prepre transaction in LOG A
|
|
|
|
s = txn2->Prepare();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// regular put so that mem table can actually be flushed for log rolling
|
|
|
|
s = db->Put(wopts, "cats", "dogs1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
auto prepare_log_no = txn1->GetLastLogNumber();
|
|
|
|
|
|
|
|
// roll to LOG B
|
|
|
|
s = db_impl->TEST_FlushMemTable(true);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// now we pause background work so that
|
|
|
|
// imm()s are not flushed before we can check their status
|
|
|
|
s = db_impl->PauseBackgroundWork();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_GT(db_impl->TEST_LogfileNumber(), prepare_log_no);
|
|
|
|
switch (txn_db_options.write_policy) {
|
|
|
|
case WRITE_COMMITTED:
|
|
|
|
// This cf is empty and should ref the latest log
|
|
|
|
ASSERT_GT(cfh_a->cfd()->GetLogNumber(), prepare_log_no);
|
|
|
|
ASSERT_EQ(cfh_a->cfd()->GetLogNumber(), db_impl->TEST_LogfileNumber());
|
|
|
|
break;
|
|
|
|
case WRITE_PREPARED:
|
|
|
|
case WRITE_UNPREPARED:
|
|
|
|
// This cf is not flushed yet and should ref the log that has its data
|
|
|
|
ASSERT_EQ(cfh_a->cfd()->GetLogNumber(), prepare_log_no);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
assert(false);
|
|
|
|
}
|
|
|
|
ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
|
|
|
|
txn1->GetLogNumber());
|
|
|
|
ASSERT_EQ(db_impl->TEST_FindMinPrepLogReferencedByMemTable(), 0);
|
|
|
|
|
|
|
|
// commit in LOG B
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
switch (txn_db_options.write_policy) {
|
|
|
|
case WRITE_COMMITTED:
|
|
|
|
ASSERT_EQ(db_impl->TEST_FindMinPrepLogReferencedByMemTable(),
|
|
|
|
prepare_log_no);
|
|
|
|
break;
|
|
|
|
case WRITE_PREPARED:
|
|
|
|
case WRITE_UNPREPARED:
|
|
|
|
// In these modes memtable do not ref the prep sections
|
|
|
|
ASSERT_EQ(db_impl->TEST_FindMinPrepLogReferencedByMemTable(), 0);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
assert(false);
|
|
|
|
}
|
|
|
|
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
ASSERT_TRUE(!db_impl->TEST_UnableToReleaseOldestLog());
|
|
|
|
|
|
|
|
// request a flush for all column families such that the earliest
|
|
|
|
// alive log file can be killed
|
|
|
|
ASSERT_OK(db_impl->TEST_SwitchWAL());
|
|
|
|
// log cannot be flushed because txn2 has not been commited
|
|
|
|
ASSERT_TRUE(!db_impl->TEST_IsLogGettingFlushed());
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
ASSERT_TRUE(db_impl->TEST_UnableToReleaseOldestLog());
|
|
|
|
|
|
|
|
// assert that cfa has a flush requested
|
|
|
|
ASSERT_TRUE(cfh_a->cfd()->imm()->HasFlushRequested());
|
|
|
|
|
|
|
|
switch (txn_db_options.write_policy) {
|
|
|
|
case WRITE_COMMITTED:
|
|
|
|
// cfb should not be flushed becuse it has no data from LOG A
|
|
|
|
ASSERT_TRUE(!cfh_b->cfd()->imm()->HasFlushRequested());
|
|
|
|
break;
|
|
|
|
case WRITE_PREPARED:
|
|
|
|
case WRITE_UNPREPARED:
|
|
|
|
// cfb should be flushed becuse it has prepared data from LOG A
|
|
|
|
ASSERT_TRUE(cfh_b->cfd()->imm()->HasFlushRequested());
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
assert(false);
|
|
|
|
}
|
|
|
|
|
|
|
|
// cfb now has data from LOG A
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_OK(db_impl->TEST_SwitchWAL());
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
ASSERT_TRUE(!db_impl->TEST_UnableToReleaseOldestLog());
|
|
|
|
|
|
|
|
// we should see that cfb now has a flush requested
|
|
|
|
ASSERT_TRUE(cfh_b->cfd()->imm()->HasFlushRequested());
|
|
|
|
|
|
|
|
// all data in LOG A resides in a memtable that has been
|
|
|
|
// requested for a flush
|
|
|
|
ASSERT_TRUE(db_impl->TEST_IsLogGettingFlushed());
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
delete cfa;
|
|
|
|
delete cfb;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* 1) use prepare to keep first log around to determine starting sequence
|
|
|
|
* during recovery.
|
|
|
|
* 2) insert many values, skipping wal, to increase seqid.
|
|
|
|
* 3) insert final value into wal
|
|
|
|
* 4) recover and see that final value was properly recovered - not
|
|
|
|
* hidden behind improperly summed sequence ids
|
|
|
|
*/
|
|
|
|
TEST_P(TransactionTest, TwoPhaseOutOfOrderDelete) {
|
|
|
|
DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
|
|
|
|
WriteOptions wal_on, wal_off;
|
|
|
|
wal_on.sync = true;
|
|
|
|
wal_on.disableWAL = false;
|
|
|
|
wal_off.disableWAL = true;
|
|
|
|
ReadOptions read_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
|
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
Transaction* txn1 = db->BeginTransaction(wal_on, txn_options);
|
|
|
|
|
|
|
|
s = txn1->SetName("1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Put(wal_on, "first", "first");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->Put(Slice("dummy"), Slice("dummy"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn1->Prepare();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Put(wal_off, "cats", "dogs1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = db->Put(wal_off, "cats", "dogs2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = db->Put(wal_off, "cats", "dogs3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db_impl->TEST_FlushMemTable(true);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Put(wal_on, "cats", "dogs4");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_OK(db->FlushWAL(false));
|
|
|
|
|
|
|
|
// kill and reopen
|
|
|
|
env->SetFilesystemActive(false);
|
|
|
|
reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
|
|
|
|
ASSERT_OK(ReOpenNoDelete());
|
|
|
|
assert(db != nullptr);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "first", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "first");
|
|
|
|
|
|
|
|
s = db->Get(read_options, "cats", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "dogs4");
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, FirstWriteTest) {
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
9 years ago
|
|
|
WriteOptions write_options;
|
|
|
|
|
|
|
|
// Test conflict checking against the very first write to a db.
|
|
|
|
// The transaction's snapshot will have seq 1 and the following write
|
|
|
|
// will have sequence 1.
|
|
|
|
Status s = db->Put(write_options, "A", "a");
|
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options);
|
|
|
|
txn->SetSnapshot();
|
|
|
|
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Put("A", "b");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, FirstWriteTest2) {
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
9 years ago
|
|
|
WriteOptions write_options;
|
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options);
|
|
|
|
txn->SetSnapshot();
|
|
|
|
|
|
|
|
// Test conflict checking against the very first write to a db.
|
|
|
|
// The transaction's snapshot is a seq 0 while the following write
|
|
|
|
// will have sequence 1.
|
|
|
|
Status s = db->Put(write_options, "A", "a");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Put("A", "b");
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, WriteOptionsTest) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
write_options.sync = true;
|
|
|
|
write_options.disableWAL = true;
|
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options);
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
|
|
|
|
ASSERT_TRUE(txn->GetWriteOptions()->sync);
|
|
|
|
|
|
|
|
write_options.sync = false;
|
|
|
|
txn->SetWriteOptions(write_options);
|
|
|
|
ASSERT_FALSE(txn->GetWriteOptions()->sync);
|
|
|
|
ASSERT_TRUE(txn->GetWriteOptions()->disableWAL);
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, WriteConflictTest) {
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
3 years ago
|
|
|
std::string value;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
Status s;
|
|
|
|
|
|
|
|
ASSERT_OK(db->Put(write_options, "foo", "A"));
|
|
|
|
ASSERT_OK(db->Put(write_options, "foo2", "B"));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options);
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
|
|
|
|
s = txn->Put("foo", "A2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Put("foo2", "B2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// This Put outside of a transaction will conflict with the previous write
|
|
|
|
s = db->Put(write_options, "foo", "xxx");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
s = db->Get(read_options, "foo", &value);
|
|
|
|
ASSERT_EQ(value, "A");
|
|
|
|
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
db->Get(read_options, "foo", &value);
|
|
|
|
ASSERT_EQ(value, "A2");
|
|
|
|
db->Get(read_options, "foo2", &value);
|
|
|
|
ASSERT_EQ(value, "B2");
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, WriteConflictTest2) {
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
std::string value;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
Status s;
|
|
|
|
|
|
|
|
ASSERT_OK(db->Put(write_options, "foo", "bar"));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
txn_options.set_snapshot = true;
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
|
|
|
|
// This Put outside of a transaction will conflict with a later write
|
|
|
|
s = db->Put(write_options, "foo", "barz");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Put("foo2", "X");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Put("foo",
|
|
|
|
"bar2"); // Conflicts with write done after snapshot taken
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
s = txn->Put("foo3", "Y");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "foo", &value);
|
|
|
|
ASSERT_EQ(value, "barz");
|
|
|
|
|
|
|
|
ASSERT_EQ(2, txn->GetNumKeys());
|
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s); // Txn should commit, but only write foo2 and foo3
|
|
|
|
|
|
|
|
// Verify that transaction wrote foo2 and foo3 but not foo
|
|
|
|
db->Get(read_options, "foo", &value);
|
|
|
|
ASSERT_EQ(value, "barz");
|
|
|
|
|
|
|
|
db->Get(read_options, "foo2", &value);
|
|
|
|
ASSERT_EQ(value, "X");
|
|
|
|
|
|
|
|
db->Get(read_options, "foo3", &value);
|
|
|
|
ASSERT_EQ(value, "Y");
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, ReadConflictTest) {
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, snapshot_read_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
std::string value;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
Status s;
|
|
|
|
|
|
|
|
ASSERT_OK(db->Put(write_options, "foo", "bar"));
|
|
|
|
ASSERT_OK(db->Put(write_options, "foo2", "bar"));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
txn_options.set_snapshot = true;
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
|
|
|
|
txn->SetSnapshot();
|
|
|
|
snapshot_read_options.snapshot = txn->GetSnapshot();
|
|
|
|
|
|
|
|
ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
|
|
|
|
// This Put outside of a transaction will conflict with the previous read
|
|
|
|
s = db->Put(write_options, "foo", "barz");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
s = db->Get(read_options, "foo", &value);
|
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "foo", &value);
|
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, TxnOnlyTest) {
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
// Test to make sure transactions work when there are no other writes in an
|
|
|
|
// empty db.
|
|
|
|
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
std::string value;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
Status s;
|
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options);
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
|
|
|
|
s = txn->Put("x", "y");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, FlushTest) {
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, snapshot_read_options;
|
|
|
|
std::string value;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
Status s;
|
|
|
|
|
|
|
|
ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar")));
|
|
|
|
ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("bar")));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options);
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
|
|
|
|
snapshot_read_options.snapshot = txn->GetSnapshot();
|
|
|
|
|
|
|
|
ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
|
|
|
|
s = txn->Put(Slice("foo"), Slice("bar2"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
|
|
|
|
// Put a random key so we have a memtable to flush
|
|
|
|
s = db->Put(write_options, "dummy", "dummy");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// force a memtable flush
|
|
|
|
FlushOptions flush_ops;
|
|
|
|
db->Flush(flush_ops);
|
|
|
|
|
|
|
|
s = txn->Commit();
|
|
|
|
// txn should commit since the flushed table is still in MemtableList History
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
db->Get(read_options, "foo", &value);
|
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, FlushTest2) {
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
9 years ago
|
|
|
const size_t num_tests = 3;
|
|
|
|
|
|
|
|
for (size_t n = 0; n < num_tests; n++) {
|
|
|
|
// Test different table factories
|
|
|
|
switch (n) {
|
|
|
|
case 0:
|
|
|
|
break;
|
|
|
|
case 1:
|
|
|
|
options.table_factory.reset(new mock::MockTableFactory());
|
|
|
|
break;
|
|
|
|
case 2: {
|
|
|
|
PlainTableOptions pt_opts;
|
|
|
|
pt_opts.hash_table_ratio = 0;
|
|
|
|
options.table_factory.reset(NewPlainTableFactory(pt_opts));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Status s = ReOpen();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
assert(db != nullptr);
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
9 years ago
|
|
|
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, snapshot_read_options;
|
|
|
|
TransactionOptions txn_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
3 years ago
|
|
|
std::string value;
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
9 years ago
|
|
|
|
|
|
|
DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
9 years ago
|
|
|
|
|
|
|
ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar")));
|
|
|
|
ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("bar2")));
|
|
|
|
ASSERT_OK(db->Put(write_options, Slice("foo3"), Slice("bar3")));
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
9 years ago
|
|
|
|
|
|
|
txn_options.set_snapshot = true;
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
|
|
|
|
snapshot_read_options.snapshot = txn->GetSnapshot();
|
|
|
|
|
|
|
|
ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
9 years ago
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
|
|
|
|
s = txn->Put(Slice("foo"), Slice("bar2"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
9 years ago
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
// verify foo is locked by txn
|
|
|
|
s = db->Delete(write_options, "foo");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
|
|
|
|
s = db->Put(write_options, "Z", "z");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = db->Put(write_options, "dummy", "dummy");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Put(write_options, "S", "s");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = db->SingleDelete(write_options, "S");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Delete("S");
|
|
|
|
// Should fail after encountering a write to S in memtable
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
|
|
|
|
// force a memtable flush
|
|
|
|
s = db_impl->TEST_FlushMemTable(true);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// Put a random key so we have a MemTable to flush
|
|
|
|
s = db->Put(write_options, "dummy", "dummy2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// force a memtable flush
|
|
|
|
ASSERT_OK(db_impl->TEST_FlushMemTable(true));
|
|
|
|
|
|
|
|
s = db->Put(write_options, "dummy", "dummy3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// force a memtable flush
|
|
|
|
// Since our test db has max_write_buffer_number=2, this flush will cause
|
|
|
|
// the first memtable to get purged from the MemtableList history.
|
|
|
|
ASSERT_OK(db_impl->TEST_FlushMemTable(true));
|
|
|
|
|
|
|
|
s = txn->Put("X", "Y");
|
|
|
|
// Should succeed after verifying there is no write to X in SST file
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Put("Z", "zz");
|
|
|
|
// Should fail after encountering a write to Z in SST file
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
|
|
|
|
s = txn->GetForUpdate(read_options, "foo2", &value);
|
|
|
|
// should succeed since key was written before txn started
|
|
|
|
ASSERT_OK(s);
|
|
|
|
// verify foo2 is locked by txn
|
|
|
|
s = db->Delete(write_options, "foo2");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
|
|
|
|
s = txn->Delete("S");
|
|
|
|
// Should fail after encountering a write to S in SST file
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
|
|
|
|
// Write a bunch of keys to db to force a compaction
|
|
|
|
Random rnd(47);
|
|
|
|
for (int i = 0; i < 1000; i++) {
|
|
|
|
s = db->Put(write_options, std::to_string(i),
|
|
|
|
test::CompressibleString(&rnd, 0.8, 100, &value));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
s = txn->Put("X", "yy");
|
|
|
|
// Should succeed after verifying there is no write to X in SST file
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Put("Z", "zzz");
|
|
|
|
// Should fail after encountering a write to Z in SST file
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
|
|
|
|
s = txn->Delete("S");
|
|
|
|
// Should fail after encountering a write to S in SST file
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
|
|
|
|
s = txn->GetForUpdate(read_options, "foo3", &value);
|
|
|
|
// should succeed since key was written before txn started
|
|
|
|
ASSERT_OK(s);
|
|
|
|
// verify foo3 is locked by txn
|
|
|
|
s = db->Delete(write_options, "foo3");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
|
|
|
|
ASSERT_OK(db_impl->TEST_WaitForCompact());
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
9 years ago
|
|
|
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// Transaction should only write the keys that succeeded.
|
|
|
|
s = db->Get(read_options, "foo", &value);
|
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
|
|
|
|
s = db->Get(read_options, "X", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("yy", value);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "Z", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("z", value);
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
delete txn;
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
9 years ago
|
|
|
}
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, NoSnapshotTest) {
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
std::string value;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
Status s;
|
|
|
|
|
|
|
|
ASSERT_OK(db->Put(write_options, "AAA", "bar"));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options);
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
|
|
|
|
// Modify key after transaction start
|
|
|
|
ASSERT_OK(db->Put(write_options, "AAA", "bar1"));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
// Read and write without a snap
|
|
|
|
ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
ASSERT_EQ(value, "bar1");
|
|
|
|
s = txn->Put("AAA", "bar2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// Should commit since read/write was done after data changed
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, MultipleSnapshotTest) {
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, snapshot_read_options;
|
|
|
|
std::string value;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
Status s;
|
|
|
|
|
|
|
|
ASSERT_OK(db->Put(write_options, "AAA", "bar"));
|
|
|
|
ASSERT_OK(db->Put(write_options, "BBB", "bar"));
|
|
|
|
ASSERT_OK(db->Put(write_options, "CCC", "bar"));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options);
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
|
|
|
|
ASSERT_OK(db->Put(write_options, "AAA", "bar1"));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
// Read and write without a snapshot
|
|
|
|
ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
ASSERT_EQ(value, "bar1");
|
|
|
|
s = txn->Put("AAA", "bar2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// Modify BBB before snapshot is taken
|
|
|
|
ASSERT_OK(db->Put(write_options, "BBB", "bar1"));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
txn->SetSnapshot();
|
|
|
|
snapshot_read_options.snapshot = txn->GetSnapshot();
|
|
|
|
|
|
|
|
// Read and write with snapshot
|
|
|
|
ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "BBB", &value));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
ASSERT_EQ(value, "bar1");
|
|
|
|
s = txn->Put("BBB", "bar2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_OK(db->Put(write_options, "CCC", "bar1"));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
// Set a new snapshot
|
|
|
|
txn->SetSnapshot();
|
|
|
|
snapshot_read_options.snapshot = txn->GetSnapshot();
|
|
|
|
|
|
|
|
// Read and write with snapshot
|
|
|
|
ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "CCC", &value));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
ASSERT_EQ(value, "bar1");
|
|
|
|
s = txn->Put("CCC", "bar2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->GetForUpdate(read_options, "AAA", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
s = txn->GetForUpdate(read_options, "BBB", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
s = txn->GetForUpdate(read_options, "CCC", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
|
|
|
|
s = db->Get(read_options, "AAA", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "bar1");
|
|
|
|
s = db->Get(read_options, "BBB", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "bar1");
|
|
|
|
s = db->Get(read_options, "CCC", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "bar1");
|
|
|
|
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "AAA", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
s = db->Get(read_options, "BBB", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
s = db->Get(read_options, "CCC", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
|
|
|
|
// verify that we track multiple writes to the same key at different snapshots
|
|
|
|
delete txn;
|
|
|
|
txn = db->BeginTransaction(write_options);
|
|
|
|
|
|
|
|
// Potentially conflicting writes
|
|
|
|
ASSERT_OK(db->Put(write_options, "ZZZ", "zzz"));
|
|
|
|
ASSERT_OK(db->Put(write_options, "XXX", "xxx"));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
txn->SetSnapshot();
|
|
|
|
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
txn_options.set_snapshot = true;
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
txn2->SetSnapshot();
|
|
|
|
|
|
|
|
// This should not conflict in txn since the snapshot is later than the
|
|
|
|
// previous write (spoiler alert: it will later conflict with txn2).
|
|
|
|
s = txn->Put("ZZZ", "zzzz");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
|
|
|
|
// This will conflict since the snapshot is earlier than another write to ZZZ
|
|
|
|
s = txn2->Put("ZZZ", "xxxxx");
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "ZZZ", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "zzzz");
|
|
|
|
|
|
|
|
delete txn2;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, ColumnFamiliesTest) {
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, snapshot_read_options;
|
|
|
|
TransactionOptions txn_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
3 years ago
|
|
|
std::string value;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
Status s;
|
|
|
|
|
|
|
|
ColumnFamilyHandle *cfa, *cfb;
|
|
|
|
ColumnFamilyOptions cf_options;
|
|
|
|
|
|
|
|
// Create 2 new column families
|
|
|
|
s = db->CreateColumnFamily(cf_options, "CFA", &cfa);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = db->CreateColumnFamily(cf_options, "CFB", &cfb);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete cfa;
|
|
|
|
delete cfb;
|
|
|
|
delete db;
|
|
|
|
db = nullptr;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
// open DB with three column families
|
|
|
|
std::vector<ColumnFamilyDescriptor> column_families;
|
|
|
|
// have to open default column family
|
|
|
|
column_families.push_back(
|
|
|
|
ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
|
|
|
|
// open the new column families
|
|
|
|
column_families.push_back(
|
|
|
|
ColumnFamilyDescriptor("CFA", ColumnFamilyOptions()));
|
|
|
|
column_families.push_back(
|
|
|
|
ColumnFamilyDescriptor("CFB", ColumnFamilyOptions()));
|
|
|
|
|
|
|
|
std::vector<ColumnFamilyHandle*> handles;
|
|
|
|
|
|
|
|
ASSERT_OK(ReOpenNoDelete(column_families, &handles));
|
|
|
|
assert(db != nullptr);
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options);
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
|
|
|
|
txn->SetSnapshot();
|
|
|
|
snapshot_read_options.snapshot = txn->GetSnapshot();
|
|
|
|
|
|
|
|
txn_options.set_snapshot = true;
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn2);
|
|
|
|
|
|
|
|
// Write some data to the db
|
|
|
|
WriteBatch batch;
|
|
|
|
ASSERT_OK(batch.Put("foo", "foo"));
|
|
|
|
ASSERT_OK(batch.Put(handles[1], "AAA", "bar"));
|
|
|
|
ASSERT_OK(batch.Put(handles[1], "AAAZZZ", "bar"));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
s = db->Write(write_options, &batch);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_OK(db->Delete(write_options, handles[1], "AAAZZZ"));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
// These keys do not conflict with existing writes since they're in
|
|
|
|
// different column families
|
|
|
|
s = txn->Delete("AAA");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn->GetForUpdate(snapshot_read_options, handles[1], "foo", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
Slice key_slice("AAAZZZ");
|
|
|
|
Slice value_slices[2] = {Slice("bar"), Slice("bar")};
|
|
|
|
s = txn->Put(handles[2], SliceParts(&key_slice, 1),
|
|
|
|
SliceParts(value_slices, 2));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(3, txn->GetNumKeys());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = db->Get(read_options, "AAA", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
s = db->Get(read_options, handles[2], "AAAZZZ", &value);
|
|
|
|
ASSERT_EQ(value, "barbar");
|
|
|
|
|
|
|
|
Slice key_slices[3] = {Slice("AAA"), Slice("ZZ"), Slice("Z")};
|
|
|
|
Slice value_slice("barbarbar");
|
|
|
|
|
|
|
|
s = txn2->Delete(handles[2], "XXX");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Delete(handles[1], "XXX");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// This write will cause a conflict with the earlier batch write
|
|
|
|
s = txn2->Put(handles[1], SliceParts(key_slices, 3),
|
|
|
|
SliceParts(&value_slice, 1));
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
// In the above the latest change to AAAZZZ in handles[1] is delete.
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
s = db->Get(read_options, handles[1], "AAAZZZ", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
delete txn;
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
txn = db->BeginTransaction(write_options, txn_options);
|
|
|
|
snapshot_read_options.snapshot = txn->GetSnapshot();
|
|
|
|
|
|
|
|
txn2 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
|
|
|
|
std::vector<ColumnFamilyHandle*> multiget_cfh = {handles[1], handles[2],
|
|
|
|
handles[0], handles[2]};
|
|
|
|
std::vector<Slice> multiget_keys = {"AAA", "AAAZZZ", "foo", "foo"};
|
|
|
|
std::vector<std::string> values(4);
|
|
|
|
std::vector<Status> results = txn->MultiGetForUpdate(
|
|
|
|
snapshot_read_options, multiget_cfh, multiget_keys, &values);
|
|
|
|
ASSERT_OK(results[0]);
|
|
|
|
ASSERT_OK(results[1]);
|
|
|
|
ASSERT_OK(results[2]);
|
|
|
|
ASSERT_TRUE(results[3].IsNotFound());
|
|
|
|
ASSERT_EQ(values[0], "bar");
|
|
|
|
ASSERT_EQ(values[1], "barbar");
|
|
|
|
ASSERT_EQ(values[2], "foo");
|
|
|
|
|
|
|
|
s = txn->SingleDelete(handles[2], "ZZZ");
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn->Put(handles[2], "ZZZ", "YYY");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn->Put(handles[2], "ZZZ", "YYYY");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn->Delete(handles[2], "ZZZ");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn->Put(handles[2], "AAAZZZ", "barbarbar");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_EQ(5, txn->GetNumKeys());
|
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
// Txn should commit
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = db->Get(read_options, handles[2], "ZZZ", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
// Put a key which will conflict with the next txn using the previous snapshot
|
|
|
|
ASSERT_OK(db->Put(write_options, handles[2], "foo", "000"));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
results = txn2->MultiGetForUpdate(snapshot_read_options, multiget_cfh,
|
|
|
|
multiget_keys, &values);
|
|
|
|
// All results should fail since there was a conflict
|
|
|
|
ASSERT_TRUE(results[0].IsBusy());
|
|
|
|
ASSERT_TRUE(results[1].IsBusy());
|
|
|
|
ASSERT_TRUE(results[2].IsBusy());
|
|
|
|
ASSERT_TRUE(results[3].IsBusy());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
s = db->Get(read_options, handles[2], "foo", &value);
|
|
|
|
ASSERT_EQ(value, "000");
|
|
|
|
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->DropColumnFamily(handles[1]);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = db->DropColumnFamily(handles[2]);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
for (auto handle : handles) {
|
|
|
|
delete handle;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, MultiGetBatchedTest) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, snapshot_read_options;
|
|
|
|
TransactionOptions txn_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
3 years ago
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
ColumnFamilyHandle* cf;
|
|
|
|
ColumnFamilyOptions cf_options;
|
|
|
|
|
|
|
|
// Create a new column families
|
|
|
|
s = db->CreateColumnFamily(cf_options, "CF", &cf);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete cf;
|
|
|
|
delete db;
|
|
|
|
db = nullptr;
|
|
|
|
|
|
|
|
// open DB with three column families
|
|
|
|
std::vector<ColumnFamilyDescriptor> column_families;
|
|
|
|
// have to open default column family
|
|
|
|
column_families.push_back(
|
|
|
|
ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
|
|
|
|
// open the new column families
|
|
|
|
cf_options.merge_operator = MergeOperators::CreateStringAppendOperator();
|
|
|
|
column_families.push_back(ColumnFamilyDescriptor("CF", cf_options));
|
|
|
|
|
|
|
|
std::vector<ColumnFamilyHandle*> handles;
|
|
|
|
|
|
|
|
options.merge_operator = MergeOperators::CreateStringAppendOperator();
|
|
|
|
ASSERT_OK(ReOpenNoDelete(column_families, &handles));
|
|
|
|
assert(db != nullptr);
|
|
|
|
|
|
|
|
// Write some data to the db
|
|
|
|
WriteBatch batch;
|
|
|
|
ASSERT_OK(batch.Put(handles[1], "aaa", "val1"));
|
|
|
|
ASSERT_OK(batch.Put(handles[1], "bbb", "val2"));
|
|
|
|
ASSERT_OK(batch.Put(handles[1], "ccc", "val3"));
|
|
|
|
ASSERT_OK(batch.Put(handles[1], "ddd", "foo"));
|
|
|
|
ASSERT_OK(batch.Put(handles[1], "eee", "val5"));
|
|
|
|
ASSERT_OK(batch.Put(handles[1], "fff", "val6"));
|
|
|
|
ASSERT_OK(batch.Merge(handles[1], "ggg", "foo"));
|
|
|
|
s = db->Write(write_options, &batch);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options);
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
|
|
|
|
txn->SetSnapshot();
|
|
|
|
snapshot_read_options.snapshot = txn->GetSnapshot();
|
|
|
|
|
|
|
|
txn_options.set_snapshot = true;
|
|
|
|
// Write some data to the db
|
|
|
|
s = txn->Delete(handles[1], "bbb");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn->Put(handles[1], "ccc", "val3_new");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn->Merge(handles[1], "ddd", "bar");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
std::vector<Slice> keys = {"aaa", "bbb", "ccc", "ddd", "eee", "fff", "ggg"};
|
|
|
|
std::vector<PinnableSlice> values(keys.size());
|
|
|
|
std::vector<Status> statuses(keys.size());
|
|
|
|
|
|
|
|
txn->MultiGet(snapshot_read_options, handles[1], keys.size(), keys.data(),
|
|
|
|
values.data(), statuses.data());
|
|
|
|
ASSERT_TRUE(statuses[0].ok());
|
|
|
|
ASSERT_EQ(values[0], "val1");
|
|
|
|
ASSERT_TRUE(statuses[1].IsNotFound());
|
|
|
|
ASSERT_TRUE(statuses[2].ok());
|
|
|
|
ASSERT_EQ(values[2], "val3_new");
|
Add Merge Operator support to WriteBatchWithIndex (#8135)
Summary:
The WBWI has two differing modes of operation dependent on the value
of the constructor parameter `overwrite_key`.
Currently, regardless of the parameter, neither mode performs as
expected when using Merge. This PR remedies this by correctly invoking
the appropriate Merge Operator before returning results from the WBWI.
Examples of issues that exist which are solved by this PR:
## Example 1 with `overwrite_key=false`
Currently, from an empty database, the following sequence:
```
Put('k1', 'v1')
Merge('k1', 'v2')
Get('k1')
```
Incorrectly yields `v2`, that is to say that the Merge behaves like a Put.
## Example 2 with o`verwrite_key=true`
Currently, from an empty database, the following sequence:
```
Put('k1', 'v1')
Merge('k1', 'v2')
Get('k1')
```
Incorrectly yields `ERROR: kMergeInProgress`.
## Example 3 with `overwrite_key=false`
Currently, with a database containing `('k1' -> 'v1')`, the following sequence:
```
Merge('k1', 'v2')
GetFromBatchAndDB('k1')
```
Incorrectly yields `v1,v2`
## Example 4 with `overwrite_key=true`
Currently, with a database containing `('k1' -> 'v1')`, the following sequence:
```
Merge('k1', 'v1')
GetFromBatchAndDB('k1')
```
Incorrectly yields `ERROR: kMergeInProgress`.
## Example 5 with `overwrite_key=false`
Currently, from an empty database, the following sequence:
```
Put('k1', 'v1')
Merge('k1', 'v2')
GetFromBatchAndDB('k1')
```
Incorrectly yields `v1,v2`
## Example 6 with `overwrite_key=true`
Currently, from an empty database, `('k1' -> 'v1')`, the following sequence:
```
Put('k1', 'v1')
Merge('k1', 'v2')
GetFromBatchAndDB('k1')
```
Incorrectly yields `ERROR: kMergeInProgress`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8135
Reviewed By: pdillinger
Differential Revision: D27657938
Pulled By: mrambacher
fbshipit-source-id: 0fbda6bbc66bedeba96a84786d90141d776297df
4 years ago
|
|
|
ASSERT_TRUE(statuses[3].ok());
|
|
|
|
ASSERT_EQ(values[3], "foo,bar");
|
|
|
|
ASSERT_TRUE(statuses[4].ok());
|
|
|
|
ASSERT_EQ(values[4], "val5");
|
|
|
|
ASSERT_TRUE(statuses[5].ok());
|
|
|
|
ASSERT_EQ(values[5], "val6");
|
|
|
|
ASSERT_TRUE(statuses[6].ok());
|
|
|
|
ASSERT_EQ(values[6], "foo");
|
|
|
|
delete txn;
|
|
|
|
for (auto handle : handles) {
|
|
|
|
delete handle;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// This test calls WriteBatchWithIndex::MultiGetFromBatchAndDB with a large
|
|
|
|
// number of keys, i.e greater than MultiGetContext::MAX_BATCH_SIZE, which is
|
|
|
|
// is 32. This forces autovector allocations in the MultiGet code paths
|
|
|
|
// to use std::vector in addition to stack allocations. The MultiGet keys
|
|
|
|
// includes Merges, which are handled specially in MultiGetFromBatchAndDB by
|
|
|
|
// allocating an autovector of MergeContexts
|
|
|
|
TEST_P(TransactionTest, MultiGetLargeBatchedTest) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, snapshot_read_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
3 years ago
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
ColumnFamilyHandle* cf;
|
|
|
|
ColumnFamilyOptions cf_options;
|
|
|
|
|
|
|
|
std::vector<std::string> key_str;
|
|
|
|
for (int i = 0; i < 100; ++i) {
|
|
|
|
key_str.emplace_back(std::to_string(i));
|
|
|
|
}
|
|
|
|
// Create a new column families
|
|
|
|
s = db->CreateColumnFamily(cf_options, "CF", &cf);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete cf;
|
|
|
|
delete db;
|
|
|
|
db = nullptr;
|
|
|
|
|
|
|
|
// open DB with three column families
|
|
|
|
std::vector<ColumnFamilyDescriptor> column_families;
|
|
|
|
// have to open default column family
|
|
|
|
column_families.push_back(
|
|
|
|
ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
|
|
|
|
// open the new column families
|
|
|
|
cf_options.merge_operator = MergeOperators::CreateStringAppendOperator();
|
|
|
|
column_families.push_back(ColumnFamilyDescriptor("CF", cf_options));
|
|
|
|
|
|
|
|
std::vector<ColumnFamilyHandle*> handles;
|
|
|
|
|
|
|
|
options.merge_operator = MergeOperators::CreateStringAppendOperator();
|
|
|
|
ASSERT_OK(ReOpenNoDelete(column_families, &handles));
|
|
|
|
assert(db != nullptr);
|
|
|
|
|
|
|
|
// Write some data to the db
|
|
|
|
WriteBatch batch;
|
|
|
|
for (int i = 0; i < 3 * MultiGetContext::MAX_BATCH_SIZE; ++i) {
|
|
|
|
std::string val = "val" + std::to_string(i);
|
|
|
|
ASSERT_OK(batch.Put(handles[1], key_str[i], val));
|
|
|
|
}
|
|
|
|
s = db->Write(write_options, &batch);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
WriteBatchWithIndex wb;
|
|
|
|
// Write some data to the db
|
|
|
|
s = wb.Delete(handles[1], std::to_string(1));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = wb.Put(handles[1], std::to_string(2), "new_val" + std::to_string(2));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
// Write a lot of merges so when we call MultiGetFromBatchAndDB later on,
|
|
|
|
// it is forced to use std::vector in ROCKSDB_NAMESPACE::autovector to
|
|
|
|
// allocate MergeContexts. The number of merges needs to be >
|
|
|
|
// MultiGetContext::MAX_BATCH_SIZE
|
|
|
|
for (int i = 8; i < MultiGetContext::MAX_BATCH_SIZE + 24; ++i) {
|
|
|
|
s = wb.Merge(handles[1], std::to_string(i), "merge");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
// MultiGet a lot of keys in order to force std::vector reallocations
|
|
|
|
std::vector<Slice> keys;
|
|
|
|
for (int i = 0; i < MultiGetContext::MAX_BATCH_SIZE + 32; ++i) {
|
|
|
|
keys.emplace_back(key_str[i]);
|
|
|
|
}
|
|
|
|
std::vector<PinnableSlice> values(keys.size());
|
|
|
|
std::vector<Status> statuses(keys.size());
|
|
|
|
|
|
|
|
wb.MultiGetFromBatchAndDB(db, snapshot_read_options, handles[1], keys.size(),
|
|
|
|
keys.data(), values.data(), statuses.data(), false);
|
|
|
|
for (size_t i = 0; i < keys.size(); ++i) {
|
|
|
|
if (i == 1) {
|
|
|
|
ASSERT_TRUE(statuses[1].IsNotFound());
|
|
|
|
} else if (i == 2) {
|
|
|
|
ASSERT_TRUE(statuses[2].ok());
|
|
|
|
ASSERT_EQ(values[2], "new_val" + std::to_string(2));
|
|
|
|
} else if (i >= 8 && i < 56) {
|
|
|
|
ASSERT_TRUE(statuses[i].ok());
|
|
|
|
ASSERT_EQ(values[i], "val" + std::to_string(i) + ",merge");
|
|
|
|
} else {
|
|
|
|
ASSERT_TRUE(statuses[i].ok());
|
|
|
|
if (values[i] != "val" + std::to_string(i)) {
|
|
|
|
ASSERT_EQ(values[i], "val" + std::to_string(i));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for (auto handle : handles) {
|
|
|
|
delete handle;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, MultiGetSnapshot) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
TransactionOptions transaction_options;
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options, transaction_options);
|
|
|
|
|
|
|
|
Slice key = "foo";
|
|
|
|
|
|
|
|
Status s = txn1->Put(key, "bar");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->SetName("test");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->Prepare();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// Get snapshot between prepare and commit
|
|
|
|
// Un-committed data should be invisible to other transactions
|
|
|
|
const Snapshot* s1 = db->GetSnapshot();
|
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn1;
|
|
|
|
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options, transaction_options);
|
|
|
|
ReadOptions read_options;
|
|
|
|
read_options.snapshot = s1;
|
|
|
|
|
|
|
|
std::vector<Slice> keys;
|
|
|
|
std::vector<PinnableSlice> values(1);
|
|
|
|
std::vector<Status> statuses(1);
|
|
|
|
keys.push_back(key);
|
|
|
|
auto cfd = db->DefaultColumnFamily();
|
|
|
|
txn2->MultiGet(read_options, cfd, 1, keys.data(), values.data(),
|
|
|
|
statuses.data());
|
|
|
|
ASSERT_TRUE(statuses[0].IsNotFound());
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
db->ReleaseSnapshot(s1);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, ColumnFamiliesTest2) {
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, snapshot_read_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
3 years ago
|
|
|
std::string value;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
Status s;
|
|
|
|
|
|
|
|
ColumnFamilyHandle *one, *two;
|
|
|
|
ColumnFamilyOptions cf_options;
|
|
|
|
|
|
|
|
// Create 2 new column families
|
|
|
|
s = db->CreateColumnFamily(cf_options, "ONE", &one);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = db->CreateColumnFamily(cf_options, "TWO", &two);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options);
|
|
|
|
ASSERT_TRUE(txn1);
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options);
|
|
|
|
ASSERT_TRUE(txn2);
|
|
|
|
|
|
|
|
s = txn1->Put(one, "X", "1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn1->Put(two, "X", "2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn1->Put("X", "0");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->Put(one, "X", "11");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// Drop first column family
|
|
|
|
s = db->DropColumnFamily(one);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// Should fail since column family was dropped.
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
txn1 = db->BeginTransaction(write_options);
|
|
|
|
ASSERT_TRUE(txn1);
|
|
|
|
|
|
|
|
// Should fail since column family was dropped
|
|
|
|
s = txn1->Put(one, "X", "111");
|
|
|
|
ASSERT_TRUE(s.IsInvalidArgument());
|
|
|
|
|
|
|
|
s = txn1->Put(two, "X", "222");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->Put("X", "000");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Get(read_options, two, "X", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("222", value);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "X", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("000", value);
|
|
|
|
|
|
|
|
s = db->DropColumnFamily(two);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
delete one;
|
|
|
|
delete two;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, EmptyTest) {
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
3 years ago
|
|
|
std::string value;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
Status s;
|
|
|
|
|
|
|
|
s = db->Put(write_options, "aaa", "aaa");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options);
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn;
|
|
|
|
|
|
|
|
txn = db->BeginTransaction(write_options);
|
|
|
|
ASSERT_OK(txn->Rollback());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
delete txn;
|
|
|
|
|
|
|
|
txn = db->BeginTransaction(write_options);
|
|
|
|
s = txn->GetForUpdate(read_options, "aaa", &value);
|
|
|
|
ASSERT_EQ(value, "aaa");
|
|
|
|
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn;
|
|
|
|
|
|
|
|
txn = db->BeginTransaction(write_options);
|
|
|
|
txn->SetSnapshot();
|
|
|
|
|
|
|
|
s = txn->GetForUpdate(read_options, "aaa", &value);
|
|
|
|
ASSERT_EQ(value, "aaa");
|
|
|
|
|
|
|
|
// Conflicts with previous GetForUpdate
|
|
|
|
s = db->Put(write_options, "aaa", "xxx");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
// transaction expired!
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, PredicateManyPreceders) {
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options1, read_options2;
|
|
|
|
TransactionOptions txn_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
3 years ago
|
|
|
std::string value;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
Status s;
|
|
|
|
|
|
|
|
txn_options.set_snapshot = true;
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
read_options1.snapshot = txn1->GetSnapshot();
|
|
|
|
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options);
|
|
|
|
txn2->SetSnapshot();
|
|
|
|
read_options2.snapshot = txn2->GetSnapshot();
|
|
|
|
|
|
|
|
std::vector<Slice> multiget_keys = {"1", "2", "3"};
|
|
|
|
std::vector<std::string> multiget_values;
|
|
|
|
|
|
|
|
std::vector<Status> results =
|
|
|
|
txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values);
|
|
|
|
ASSERT_EQ(results.size(), 3);
|
|
|
|
ASSERT_TRUE(results[0].IsNotFound());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
ASSERT_TRUE(results[1].IsNotFound());
|
|
|
|
ASSERT_TRUE(results[2].IsNotFound());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
s = txn2->Put("2", "x"); // Conflict's with txn1's MultiGetForUpdate
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
ASSERT_OK(txn2->Rollback());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
multiget_values.clear();
|
|
|
|
results =
|
|
|
|
txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values);
|
|
|
|
ASSERT_EQ(results.size(), 3);
|
|
|
|
ASSERT_TRUE(results[0].IsNotFound());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
ASSERT_TRUE(results[1].IsNotFound());
|
|
|
|
ASSERT_TRUE(results[2].IsNotFound());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
read_options1.snapshot = txn1->GetSnapshot();
|
|
|
|
|
|
|
|
txn2 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
read_options2.snapshot = txn2->GetSnapshot();
|
|
|
|
|
|
|
|
s = txn1->Put("4", "x");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->Delete("4"); // conflict
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->GetForUpdate(read_options2, "4", &value);
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
|
|
|
|
ASSERT_OK(txn2->Rollback());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, LostUpdate) {
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, read_options1, read_options2;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
std::string value;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
Status s;
|
|
|
|
|
|
|
|
// Test 2 transactions writing to the same key in multiple orders and
|
|
|
|
// with/without snapshots
|
|
|
|
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options);
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options);
|
|
|
|
|
|
|
|
s = txn1->Put("1", "1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->Put("1", "2"); // conflict
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "1", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("1", value);
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
txn_options.set_snapshot = true;
|
|
|
|
txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
read_options1.snapshot = txn1->GetSnapshot();
|
|
|
|
|
|
|
|
txn2 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
read_options2.snapshot = txn2->GetSnapshot();
|
|
|
|
|
|
|
|
s = txn1->Put("1", "3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("1", "4"); // conflict
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "1", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("3", value);
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
read_options1.snapshot = txn1->GetSnapshot();
|
|
|
|
|
|
|
|
txn2 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
read_options2.snapshot = txn2->GetSnapshot();
|
|
|
|
|
|
|
|
s = txn1->Put("1", "5");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->Put("1", "6");
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "1", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("5", value);
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
read_options1.snapshot = txn1->GetSnapshot();
|
|
|
|
|
|
|
|
txn2 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
read_options2.snapshot = txn2->GetSnapshot();
|
|
|
|
|
|
|
|
s = txn1->Put("1", "7");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn2->SetSnapshot();
|
|
|
|
s = txn2->Put("1", "8");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "1", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("8", value);
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
txn1 = db->BeginTransaction(write_options);
|
|
|
|
txn2 = db->BeginTransaction(write_options);
|
|
|
|
|
|
|
|
s = txn1->Put("1", "9");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->Put("1", "10");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
s = db->Get(read_options, "1", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "10");
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, UntrackedWrites) {
|
|
|
|
if (txn_db_options.write_policy == WRITE_UNPREPARED) {
|
|
|
|
// TODO(lth): For WriteUnprepared, validate that untracked writes are
|
|
|
|
// not supported.
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
std::string value;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
Status s;
|
|
|
|
|
|
|
|
// Verify transaction rollback works for untracked keys.
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options);
|
|
|
|
txn->SetSnapshot();
|
|
|
|
|
|
|
|
s = txn->PutUntracked("untracked", "0");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_OK(txn->Rollback());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
s = db->Get(read_options, "untracked", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
txn = db->BeginTransaction(write_options);
|
|
|
|
txn->SetSnapshot();
|
|
|
|
|
|
|
|
s = db->Put(write_options, "untracked", "x");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// Untracked writes should succeed even though key was written after snapshot
|
|
|
|
s = txn->PutUntracked("untracked", "1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn->MergeUntracked("untracked", "2");
|
|
|
|
ASSERT_OK(s);
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
s = txn->DeleteUntracked("untracked");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// Conflict
|
|
|
|
s = txn->Put("untracked", "3");
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "untracked", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, ExpiredTransaction) {
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
TransactionOptions txn_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
3 years ago
|
|
|
std::string value;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
Status s;
|
|
|
|
|
|
|
|
// Set txn expiration timeout to 0 microseconds (expires instantly)
|
|
|
|
txn_options.expiration = 0;
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
|
|
|
|
s = txn1->Put("X", "1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->Put("Y", "1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options);
|
|
|
|
|
|
|
|
// txn2 should be able to write to X since txn1 has expired
|
|
|
|
s = txn2->Put("X", "2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = db->Get(read_options, "X", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("2", value);
|
|
|
|
|
|
|
|
s = txn1->Put("Z", "1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// txn1 should fail to commit since it is expired
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_TRUE(s.IsExpired());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
s = db->Get(read_options, "Y", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = db->Get(read_options, "Z", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, ReinitializeTest) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
// Set txn expiration timeout to 0 microseconds (expires instantly)
|
|
|
|
txn_options.expiration = 0;
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
|
|
|
|
// Reinitialize transaction to no long expire
|
|
|
|
txn_options.expiration = -1;
|
|
|
|
txn1 = db->BeginTransaction(write_options, txn_options, txn1);
|
|
|
|
|
|
|
|
s = txn1->Put("Z", "z");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// Should commit since not expired
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn1 = db->BeginTransaction(write_options, txn_options, txn1);
|
|
|
|
|
|
|
|
s = txn1->Put("Z", "zz");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// Reinitilize txn1 and verify that Z gets unlocked
|
|
|
|
txn1 = db->BeginTransaction(write_options, txn_options, txn1);
|
|
|
|
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options, txn_options, nullptr);
|
|
|
|
s = txn2->Put("Z", "zzz");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
s = db->Get(read_options, "Z", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "zzz");
|
|
|
|
|
|
|
|
// Verify snapshots get reinitialized correctly
|
|
|
|
txn1->SetSnapshot();
|
|
|
|
s = txn1->Put("Z", "zzzz");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "Z", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "zzzz");
|
|
|
|
|
|
|
|
txn1 = db->BeginTransaction(write_options, txn_options, txn1);
|
|
|
|
const Snapshot* snapshot = txn1->GetSnapshot();
|
|
|
|
ASSERT_FALSE(snapshot);
|
|
|
|
|
|
|
|
txn_options.set_snapshot = true;
|
|
|
|
txn1 = db->BeginTransaction(write_options, txn_options, txn1);
|
|
|
|
snapshot = txn1->GetSnapshot();
|
|
|
|
ASSERT_TRUE(snapshot);
|
|
|
|
|
|
|
|
s = txn1->Put("Z", "a");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_OK(txn1->Rollback());
|
|
|
|
|
|
|
|
s = txn1->Put("Y", "y");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn_options.set_snapshot = false;
|
|
|
|
txn1 = db->BeginTransaction(write_options, txn_options, txn1);
|
|
|
|
snapshot = txn1->GetSnapshot();
|
|
|
|
ASSERT_FALSE(snapshot);
|
|
|
|
|
|
|
|
s = txn1->Put("X", "x");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "Z", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "zzzz");
|
|
|
|
|
|
|
|
s = db->Get(read_options, "Y", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
txn1 = db->BeginTransaction(write_options, txn_options, txn1);
|
|
|
|
|
|
|
|
s = txn1->SetName("name");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->Prepare();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn1 = db->BeginTransaction(write_options, txn_options, txn1);
|
|
|
|
|
|
|
|
s = txn1->SetName("name");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, Rollback) {
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
std::string value;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
Status s;
|
|
|
|
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->Put("X", "1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options);
|
|
|
|
|
|
|
|
// txn2 should not be able to write to X since txn1 has it locked
|
|
|
|
s = txn2->Put("X", "2");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
ASSERT_OK(txn1->Rollback());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
delete txn1;
|
|
|
|
|
|
|
|
// txn2 should now be able to write to X
|
|
|
|
s = txn2->Put("X", "3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "X", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("3", value);
|
|
|
|
|
|
|
|
delete txn2;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, LockLimitTest) {
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, snapshot_read_options;
|
|
|
|
TransactionOptions txn_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
3 years ago
|
|
|
std::string value;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
Status s;
|
|
|
|
|
|
|
|
delete db;
|
|
|
|
db = nullptr;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
// Open DB with a lock limit of 3
|
|
|
|
txn_db_options.max_num_locks = 3;
|
|
|
|
ASSERT_OK(ReOpen());
|
|
|
|
assert(db != nullptr);
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// Create a txn and verify we can only lock up to 3 keys
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options, txn_options);
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
|
|
|
|
s = txn->Put("X", "x");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Put("Y", "y");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Put("Z", "z");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// lock limit reached
|
|
|
|
s = txn->Put("W", "w");
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
|
|
|
|
// re-locking same key shouldn't put us over the limit
|
|
|
|
s = txn->Put("X", "xx");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->GetForUpdate(read_options, "W", &value);
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
s = txn->GetForUpdate(read_options, "V", &value);
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
|
|
|
|
// re-locking same key shouldn't put us over the limit
|
|
|
|
s = txn->GetForUpdate(read_options, "Y", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("y", value);
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "W", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
ASSERT_TRUE(txn2);
|
|
|
|
|
|
|
|
// "X" currently locked
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
s = txn2->Put("X", "x");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
// lock limit reached
|
|
|
|
s = txn2->Put("M", "m");
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "X", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("xx", value);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "W", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
// Committing txn should release its locks and allow txn2 to proceed
|
|
|
|
s = txn2->Put("X", "x2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->Delete("X");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->Put("M", "m");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->Put("Z", "z2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// lock limit reached
|
|
|
|
s = txn2->Delete("Y");
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "Z", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("z2", value);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "Y", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("y", value);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "X", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
delete txn2;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, IteratorTest) {
|
|
|
|
// This test does writes without snapshot validation, and then tries to create
|
|
|
|
// iterator later, which is unsupported in write unprepared.
|
|
|
|
if (txn_db_options.write_policy == WRITE_UNPREPARED) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, snapshot_read_options;
|
|
|
|
std::string value;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
Status s;
|
|
|
|
|
|
|
|
// Write some keys to the db
|
|
|
|
s = db->Put(write_options, "A", "a");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Put(write_options, "G", "g");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Put(write_options, "F", "f");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Put(write_options, "C", "c");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Put(write_options, "D", "d");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options);
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
|
|
|
|
// Write some keys in a txn
|
|
|
|
s = txn->Put("B", "b");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Put("H", "h");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Delete("D");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Put("E", "e");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn->SetSnapshot();
|
|
|
|
const Snapshot* snapshot = txn->GetSnapshot();
|
|
|
|
|
|
|
|
// Write some keys to the db after the snapshot
|
|
|
|
s = db->Put(write_options, "BB", "xx");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Put(write_options, "C", "xx");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
read_options.snapshot = snapshot;
|
|
|
|
Iterator* iter = txn->GetIterator(read_options);
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
iter->SeekToFirst();
|
|
|
|
|
|
|
|
// Read all keys via iter and lock them all
|
|
|
|
std::string results[] = {"a", "b", "c", "e", "f", "g", "h"};
|
|
|
|
for (int i = 0; i < 7; i++) {
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(results[i], iter->value().ToString());
|
|
|
|
|
|
|
|
s = txn->GetForUpdate(read_options, iter->key(), nullptr);
|
|
|
|
if (i == 2) {
|
|
|
|
// "C" was modified after txn's snapshot
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
}
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
|
|
|
|
iter->Seek("G");
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("g", iter->value().ToString());
|
|
|
|
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("f", iter->value().ToString());
|
|
|
|
|
|
|
|
iter->Seek("D");
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("e", iter->value().ToString());
|
|
|
|
|
|
|
|
iter->Seek("C");
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("c", iter->value().ToString());
|
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("e", iter->value().ToString());
|
|
|
|
|
|
|
|
iter->Seek("");
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("a", iter->value().ToString());
|
|
|
|
|
|
|
|
iter->Seek("X");
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
|
|
|
|
iter->SeekToLast();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("h", iter->value().ToString());
|
|
|
|
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, DisableIndexingTest) {
|
|
|
|
// Skip this test for write unprepared. It does not solely rely on WBWI for
|
|
|
|
// read your own writes, so depending on whether batches are flushed or not,
|
|
|
|
// only some writes will be visible.
|
|
|
|
//
|
|
|
|
// Also, write unprepared does not support creating iterators if there has
|
|
|
|
// been txn->Put() without snapshot validation.
|
|
|
|
if (txn_db_options.write_policy == WRITE_UNPREPARED) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options);
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
|
|
|
|
s = txn->Put("A", "a");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "A", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("a", value);
|
|
|
|
|
|
|
|
txn->DisableIndexing();
|
|
|
|
|
|
|
|
s = txn->Put("B", "b");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "B", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
Iterator* iter = txn->GetIterator(read_options);
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
|
|
|
|
iter->Seek("B");
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
|
|
|
|
s = txn->Delete("A");
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "A", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("a", value);
|
|
|
|
|
|
|
|
txn->EnableIndexing();
|
|
|
|
|
|
|
|
s = txn->Put("B", "bb");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
iter->Seek("B");
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("bb", iter->value().ToString());
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "B", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("bb", value);
|
|
|
|
|
|
|
|
s = txn->Put("A", "aa");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "A", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("aa", value);
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, SavepointTest) {
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, snapshot_read_options;
|
|
|
|
std::string value;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
Status s;
|
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options);
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
|
|
|
|
ASSERT_EQ(0, txn->GetNumPuts());
|
|
|
|
|
|
|
|
s = txn->RollbackToSavePoint();
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
txn->SetSavePoint(); // 1
|
|
|
|
|
|
|
|
ASSERT_OK(txn->RollbackToSavePoint()); // Rollback to beginning of txn
|
|
|
|
s = txn->RollbackToSavePoint();
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
s = txn->Put("B", "b");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_EQ(1, txn->GetNumPuts());
|
|
|
|
ASSERT_EQ(0, txn->GetNumDeletes());
|
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "B", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("b", value);
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
txn = db->BeginTransaction(write_options);
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
|
|
|
|
s = txn->Put("A", "a");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Put("B", "bb");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Put("C", "c");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn->SetSavePoint(); // 2
|
|
|
|
|
|
|
|
s = txn->Delete("B");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Put("C", "cc");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Put("D", "d");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_EQ(5, txn->GetNumPuts());
|
|
|
|
ASSERT_EQ(1, txn->GetNumDeletes());
|
|
|
|
|
|
|
|
ASSERT_OK(txn->RollbackToSavePoint()); // Rollback to 2
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
ASSERT_EQ(3, txn->GetNumPuts());
|
|
|
|
ASSERT_EQ(0, txn->GetNumDeletes());
|
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
s = txn->Get(read_options, "A", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("a", value);
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "B", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("bb", value);
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "C", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("c", value);
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "D", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = txn->Put("A", "a");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Put("E", "e");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_EQ(5, txn->GetNumPuts());
|
|
|
|
ASSERT_EQ(0, txn->GetNumDeletes());
|
|
|
|
|
|
|
|
// Rollback to beginning of txn
|
|
|
|
s = txn->RollbackToSavePoint();
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
ASSERT_OK(txn->Rollback());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
ASSERT_EQ(0, txn->GetNumPuts());
|
|
|
|
ASSERT_EQ(0, txn->GetNumDeletes());
|
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
s = txn->Get(read_options, "A", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "B", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("b", value);
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "D", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "D", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "E", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = txn->Put("A", "aa");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Put("F", "f");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_EQ(2, txn->GetNumPuts());
|
|
|
|
ASSERT_EQ(0, txn->GetNumDeletes());
|
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
txn->SetSavePoint(); // 3
|
|
|
|
txn->SetSavePoint(); // 4
|
|
|
|
|
|
|
|
s = txn->Put("G", "g");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->SingleDelete("F");
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Delete("B");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "A", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("aa", value);
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "F", &value);
|
|
|
|
// According to db.h, doing a SingleDelete on a key that has been
|
|
|
|
// overwritten will have undefinied behavior. So it is unclear what the
|
|
|
|
// result of fetching "F" should be. The current implementation will
|
|
|
|
// return NotFound in this case.
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "B", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
ASSERT_EQ(3, txn->GetNumPuts());
|
|
|
|
ASSERT_EQ(2, txn->GetNumDeletes());
|
|
|
|
|
|
|
|
ASSERT_OK(txn->RollbackToSavePoint()); // Rollback to 3
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
ASSERT_EQ(2, txn->GetNumPuts());
|
|
|
|
ASSERT_EQ(0, txn->GetNumDeletes());
|
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
s = txn->Get(read_options, "F", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("f", value);
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "G", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "F", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("f", value);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "G", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = db->Get(read_options, "A", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("aa", value);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "B", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("b", value);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "C", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = db->Get(read_options, "D", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = db->Get(read_options, "E", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, SavepointTest2) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
txn_options.lock_timeout = 1; // 1 ms
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn1);
|
|
|
|
|
|
|
|
s = txn1->Put("A", "");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn1->SetSavePoint(); // 1
|
|
|
|
|
|
|
|
s = txn1->Put("A", "a");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->Put("C", "c");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn1->SetSavePoint(); // 2
|
|
|
|
|
|
|
|
s = txn1->Put("A", "a");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn1->Put("B", "b");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_OK(txn1->RollbackToSavePoint()); // Rollback to 2
|
|
|
|
|
|
|
|
// Verify that "A" and "C" is still locked while "B" is not
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn2);
|
|
|
|
|
|
|
|
s = txn2->Put("A", "a2");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("C", "c2");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("B", "b2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->Put("A", "aa");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn1->Put("B", "bb");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
s = txn1->Put("A", "aaa");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn1->Put("B", "bbb");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn1->Put("C", "ccc");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn1->SetSavePoint(); // 3
|
|
|
|
ASSERT_OK(txn1->RollbackToSavePoint()); // Rollback to 3
|
|
|
|
|
|
|
|
// Verify that "A", "B", "C" are still locked
|
|
|
|
txn2 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn2);
|
|
|
|
|
|
|
|
s = txn2->Put("A", "a2");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("B", "b2");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("C", "c2");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
|
|
|
|
ASSERT_OK(txn1->RollbackToSavePoint()); // Rollback to 1
|
|
|
|
|
|
|
|
// Verify that only "A" is locked
|
|
|
|
s = txn2->Put("A", "a3");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("B", "b3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("C", "c3po");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn1;
|
|
|
|
|
|
|
|
// Verify "A" "C" "B" are no longer locked
|
|
|
|
s = txn2->Put("A", "a4");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("B", "b4");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("C", "c4");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn2;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, SavepointTest3) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
txn_options.lock_timeout = 1; // 1 ms
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn1);
|
|
|
|
|
|
|
|
s = txn1->PopSavePoint(); // No SavePoint present
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = txn1->Put("A", "");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->PopSavePoint(); // Still no SavePoint present
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
txn1->SetSavePoint(); // 1
|
|
|
|
|
|
|
|
s = txn1->Put("A", "a");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->PopSavePoint(); // Remove 1
|
|
|
|
ASSERT_TRUE(txn1->RollbackToSavePoint().IsNotFound());
|
|
|
|
|
|
|
|
// Verify that "A" is still locked
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn2);
|
|
|
|
|
|
|
|
s = txn2->Put("A", "a2");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
txn1->SetSavePoint(); // 2
|
|
|
|
|
|
|
|
s = txn1->Put("B", "b");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn1->SetSavePoint(); // 3
|
|
|
|
|
|
|
|
s = txn1->Put("B", "b2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_OK(txn1->RollbackToSavePoint()); // Roll back to 2
|
|
|
|
|
|
|
|
s = txn1->PopSavePoint();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->PopSavePoint();
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn1;
|
|
|
|
|
|
|
|
std::string value;
|
|
|
|
|
|
|
|
// tnx1 should have modified "A" to "a"
|
|
|
|
s = db->Get(read_options, "A", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("a", value);
|
|
|
|
|
|
|
|
// tnx1 should have set "B" to just "b"
|
|
|
|
s = db->Get(read_options, "B", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("b", value);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "C", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, SavepointTest4) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
txn_options.lock_timeout = 1; // 1 ms
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn1);
|
|
|
|
|
|
|
|
txn1->SetSavePoint(); // 1
|
|
|
|
s = txn1->Put("A", "a");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn1->SetSavePoint(); // 2
|
|
|
|
s = txn1->Put("B", "b");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->PopSavePoint(); // Remove 2
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// Verify that A/B still exists.
|
|
|
|
std::string value;
|
|
|
|
ASSERT_OK(txn1->Get(read_options, "A", &value));
|
|
|
|
ASSERT_EQ("a", value);
|
|
|
|
|
|
|
|
ASSERT_OK(txn1->Get(read_options, "B", &value));
|
|
|
|
ASSERT_EQ("b", value);
|
|
|
|
|
|
|
|
ASSERT_OK(txn1->RollbackToSavePoint()); // Rollback to 1
|
|
|
|
|
|
|
|
// Verify that everything was rolled back.
|
|
|
|
s = txn1->Get(read_options, "A", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = txn1->Get(read_options, "B", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
// Nothing should be locked
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn2);
|
|
|
|
|
|
|
|
s = txn2->Put("A", "");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->Put("B", "");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete txn2;
|
|
|
|
delete txn1;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, UndoGetForUpdateTest) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
txn_options.lock_timeout = 1; // 1 ms
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn1);
|
|
|
|
|
|
|
|
txn1->UndoGetForUpdate("A");
|
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn1;
|
|
|
|
|
|
|
|
txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
|
|
|
|
txn1->UndoGetForUpdate("A");
|
|
|
|
s = txn1->GetForUpdate(read_options, "A", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
// Verify that A is locked
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
s = txn2->Put("A", "a");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
|
|
|
|
txn1->UndoGetForUpdate("A");
|
|
|
|
|
|
|
|
// Verify that A is now unlocked
|
|
|
|
s = txn2->Put("A", "a2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_OK(txn2->Commit());
|
|
|
|
delete txn2;
|
|
|
|
s = db->Get(read_options, "A", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("a2", value);
|
|
|
|
|
|
|
|
s = txn1->Delete("A");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn1->GetForUpdate(read_options, "A", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = txn1->Put("B", "b3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn1->GetForUpdate(read_options, "B", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn1->UndoGetForUpdate("A");
|
|
|
|
txn1->UndoGetForUpdate("B");
|
|
|
|
|
|
|
|
// Verify that A and B are still locked
|
|
|
|
txn2 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
s = txn2->Put("A", "a4");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("B", "b4");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
|
|
|
|
ASSERT_OK(txn1->Rollback());
|
|
|
|
delete txn1;
|
|
|
|
|
|
|
|
// Verify that A and B are no longer locked
|
|
|
|
s = txn2->Put("A", "a5");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("B", "b5");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Commit();
|
|
|
|
delete txn2;
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
|
|
|
|
s = txn1->GetForUpdate(read_options, "A", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn1->GetForUpdate(read_options, "A", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn1->GetForUpdate(read_options, "C", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
s = txn1->GetForUpdate(read_options, "A", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn1->GetForUpdate(read_options, "C", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
s = txn1->GetForUpdate(read_options, "B", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn1->Put("B", "b5");
|
|
|
|
s = txn1->GetForUpdate(read_options, "B", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn1->UndoGetForUpdate("A");
|
|
|
|
txn1->UndoGetForUpdate("B");
|
|
|
|
txn1->UndoGetForUpdate("C");
|
|
|
|
txn1->UndoGetForUpdate("X");
|
|
|
|
|
|
|
|
// Verify A,B,C are locked
|
|
|
|
txn2 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
s = txn2->Put("A", "a6");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Delete("B");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("C", "c6");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("X", "x6");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn1->UndoGetForUpdate("A");
|
|
|
|
txn1->UndoGetForUpdate("B");
|
|
|
|
txn1->UndoGetForUpdate("C");
|
|
|
|
txn1->UndoGetForUpdate("X");
|
|
|
|
|
|
|
|
// Verify A,B are locked and C is not
|
|
|
|
s = txn2->Put("A", "a6");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Delete("B");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("C", "c6");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("X", "x6");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn1->UndoGetForUpdate("A");
|
|
|
|
txn1->UndoGetForUpdate("B");
|
|
|
|
txn1->UndoGetForUpdate("C");
|
|
|
|
txn1->UndoGetForUpdate("X");
|
|
|
|
|
|
|
|
// Verify B is locked and A and C are not
|
|
|
|
s = txn2->Put("A", "a7");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Delete("B");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("C", "c7");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("X", "x7");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn1;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, UndoGetForUpdateTest2) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
s = db->Put(write_options, "A", "");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn_options.lock_timeout = 1; // 1 ms
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn1);
|
|
|
|
|
|
|
|
s = txn1->GetForUpdate(read_options, "A", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn1->GetForUpdate(read_options, "B", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = txn1->Put("F", "f");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn1->SetSavePoint(); // 1
|
|
|
|
|
|
|
|
txn1->UndoGetForUpdate("A");
|
|
|
|
|
|
|
|
s = txn1->GetForUpdate(read_options, "C", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
s = txn1->GetForUpdate(read_options, "D", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = txn1->Put("E", "e");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn1->GetForUpdate(read_options, "E", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->GetForUpdate(read_options, "F", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// Verify A,B,C,D,E,F are still locked
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
s = txn2->Put("A", "a1");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("B", "b1");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("C", "c1");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("D", "d1");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("E", "e1");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("F", "f1");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
|
|
|
|
txn1->UndoGetForUpdate("C");
|
|
|
|
txn1->UndoGetForUpdate("E");
|
|
|
|
|
|
|
|
// Verify A,B,D,E,F are still locked and C is not.
|
|
|
|
s = txn2->Put("A", "a2");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("B", "b2");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("D", "d2");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("E", "e2");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("F", "f2");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("C", "c2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn1->SetSavePoint(); // 2
|
|
|
|
|
|
|
|
s = txn1->Put("H", "h");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn1->UndoGetForUpdate("A");
|
|
|
|
txn1->UndoGetForUpdate("B");
|
|
|
|
txn1->UndoGetForUpdate("C");
|
|
|
|
txn1->UndoGetForUpdate("D");
|
|
|
|
txn1->UndoGetForUpdate("E");
|
|
|
|
txn1->UndoGetForUpdate("F");
|
|
|
|
txn1->UndoGetForUpdate("G");
|
|
|
|
txn1->UndoGetForUpdate("H");
|
|
|
|
|
|
|
|
// Verify A,B,D,E,F,H are still locked and C,G are not.
|
|
|
|
s = txn2->Put("A", "a3");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("B", "b3");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("D", "d3");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("E", "e3");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("F", "f3");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("H", "h3");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("C", "c3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("G", "g3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_OK(txn1->RollbackToSavePoint()); // rollback to 2
|
|
|
|
|
|
|
|
// Verify A,B,D,E,F are still locked and C,G,H are not.
|
|
|
|
s = txn2->Put("A", "a3");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("B", "b3");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("D", "d3");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("E", "e3");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("F", "f3");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("C", "c3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("G", "g3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("H", "h3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn1->UndoGetForUpdate("A");
|
|
|
|
txn1->UndoGetForUpdate("B");
|
|
|
|
txn1->UndoGetForUpdate("C");
|
|
|
|
txn1->UndoGetForUpdate("D");
|
|
|
|
txn1->UndoGetForUpdate("E");
|
|
|
|
txn1->UndoGetForUpdate("F");
|
|
|
|
txn1->UndoGetForUpdate("G");
|
|
|
|
txn1->UndoGetForUpdate("H");
|
|
|
|
|
|
|
|
// Verify A,B,E,F are still locked and C,D,G,H are not.
|
|
|
|
s = txn2->Put("A", "a3");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("B", "b3");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("E", "e3");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("F", "f3");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("C", "c3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("D", "d3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("G", "g3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("H", "h3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_OK(txn1->RollbackToSavePoint()); // rollback to 1
|
|
|
|
|
|
|
|
// Verify A,B,F are still locked and C,D,E,G,H are not.
|
|
|
|
s = txn2->Put("A", "a3");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("B", "b3");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("F", "f3");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("C", "c3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("D", "d3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("E", "e3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("G", "g3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("H", "h3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn1->UndoGetForUpdate("A");
|
|
|
|
txn1->UndoGetForUpdate("B");
|
|
|
|
txn1->UndoGetForUpdate("C");
|
|
|
|
txn1->UndoGetForUpdate("D");
|
|
|
|
txn1->UndoGetForUpdate("E");
|
|
|
|
txn1->UndoGetForUpdate("F");
|
|
|
|
txn1->UndoGetForUpdate("G");
|
|
|
|
txn1->UndoGetForUpdate("H");
|
|
|
|
|
|
|
|
// Verify F is still locked and A,B,C,D,E,G,H are not.
|
|
|
|
s = txn2->Put("F", "f3");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Put("A", "a3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("B", "b3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("C", "c3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("D", "d3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("E", "e3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("G", "g3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Put("H", "h3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, TimeoutTest) {
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
std::string value;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
Status s;
|
|
|
|
|
|
|
|
delete db;
|
|
|
|
db = nullptr;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
// transaction writes have an infinite timeout,
|
|
|
|
// but we will override this when we start a txn
|
|
|
|
// db writes have infinite timeout
|
|
|
|
txn_db_options.transaction_lock_timeout = -1;
|
|
|
|
txn_db_options.default_lock_timeout = -1;
|
|
|
|
|
|
|
|
s = TransactionDB::Open(options, txn_db_options, dbname, &db);
|
|
|
|
assert(db != nullptr);
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Put(write_options, "aaa", "aaa");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
TransactionOptions txn_options0;
|
|
|
|
txn_options0.expiration = 100; // 100ms
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
txn_options0.lock_timeout = 50; // txn timeout no longer infinite
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options, txn_options0);
|
|
|
|
|
|
|
|
s = txn1->GetForUpdate(read_options, "aaa", nullptr);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// Conflicts with previous GetForUpdate.
|
|
|
|
// Since db writes do not have a timeout, this should eventually succeed when
|
|
|
|
// the transaction expires.
|
|
|
|
s = db->Put(write_options, "aaa", "xxx");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ASSERT_GE(txn1->GetElapsedTime(),
|
|
|
|
static_cast<uint64_t>(txn_options0.expiration));
|
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_TRUE(s.IsExpired()); // expired!
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
s = db->Get(read_options, "aaa", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("xxx", value);
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete db;
|
|
|
|
|
|
|
|
// transaction writes have 10ms timeout,
|
|
|
|
// db writes have infinite timeout
|
|
|
|
txn_db_options.transaction_lock_timeout = 50;
|
|
|
|
txn_db_options.default_lock_timeout = -1;
|
|
|
|
|
|
|
|
s = TransactionDB::Open(options, txn_db_options, dbname, &db);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Put(write_options, "aaa", "aaa");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
txn_options.expiration = 100; // 100ms
|
|
|
|
txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
|
|
|
|
s = txn1->GetForUpdate(read_options, "aaa", nullptr);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// Conflicts with previous GetForUpdate.
|
|
|
|
// Since db writes do not have a timeout, this should eventually succeed when
|
|
|
|
// the transaction expires.
|
|
|
|
s = db->Put(write_options, "aaa", "xxx");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_NOK(s); // expired!
|
|
|
|
|
|
|
|
s = db->Get(read_options, "aaa", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("xxx", value);
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
txn_options.expiration = 6000000; // 100 minutes
|
|
|
|
txn_options.lock_timeout = 1; // 1ms
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
txn1->SetLockTimeout(100);
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
TransactionOptions txn_options2;
|
|
|
|
txn_options2.expiration = 10; // 10ms
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options, txn_options2);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->Put("a", "2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// txn1 has a lock timeout longer than txn2's expiration, so it will win
|
|
|
|
s = txn1->Delete("a");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// txn2 should be expired out since txn1 waiting until its timeout expired.
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_TRUE(s.IsExpired());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
txn_options.expiration = 6000000; // 100 minutes
|
|
|
|
txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
txn_options2.expiration = 100000000;
|
|
|
|
txn2 = db->BeginTransaction(write_options, txn_options2);
|
|
|
|
|
|
|
|
s = txn1->Delete("asdf");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// txn2 has a smaller lock timeout than txn1's expiration, so it will time out
|
|
|
|
s = txn2->Delete("asdf");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->Put("asdf", "asdf");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "asdf", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("asdf", value);
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, SingleDeleteTest) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options);
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
|
|
|
|
s = txn->SingleDelete("A");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "A", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn;
|
|
|
|
|
|
|
|
txn = db->BeginTransaction(write_options);
|
|
|
|
|
|
|
|
s = txn->SingleDelete("A");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Put("A", "a");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "A", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("a", value);
|
|
|
|
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn;
|
|
|
|
|
|
|
|
s = db->Get(read_options, "A", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("a", value);
|
|
|
|
|
|
|
|
txn = db->BeginTransaction(write_options);
|
|
|
|
|
|
|
|
s = txn->SingleDelete("A");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "A", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn;
|
|
|
|
|
|
|
|
s = db->Get(read_options, "A", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
txn = db->BeginTransaction(write_options);
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options);
|
|
|
|
txn2->SetSnapshot();
|
|
|
|
|
|
|
|
s = txn->Put("A", "a");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Put("A", "a2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->SingleDelete("A");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->SingleDelete("B");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// According to db.h, doing a SingleDelete on a key that has been
|
|
|
|
// overwritten will have undefinied behavior. So it is unclear what the
|
|
|
|
// result of fetching "A" should be. The current implementation will
|
|
|
|
// return NotFound in this case.
|
|
|
|
s = txn->Get(read_options, "A", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = txn2->Put("B", "b");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn;
|
|
|
|
|
|
|
|
// According to db.h, doing a SingleDelete on a key that has been
|
|
|
|
// overwritten will have undefinied behavior. So it is unclear what the
|
|
|
|
// result of fetching "A" should be. The current implementation will
|
|
|
|
// return NotFound in this case.
|
|
|
|
s = db->Get(read_options, "A", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = db->Get(read_options, "B", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, MergeTest) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options, TransactionOptions());
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
|
|
|
|
s = db->Put(write_options, "A", "a0");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Merge("A", "1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Merge("A", "2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "A", &value);
|
Add Merge Operator support to WriteBatchWithIndex (#8135)
Summary:
The WBWI has two differing modes of operation dependent on the value
of the constructor parameter `overwrite_key`.
Currently, regardless of the parameter, neither mode performs as
expected when using Merge. This PR remedies this by correctly invoking
the appropriate Merge Operator before returning results from the WBWI.
Examples of issues that exist which are solved by this PR:
## Example 1 with `overwrite_key=false`
Currently, from an empty database, the following sequence:
```
Put('k1', 'v1')
Merge('k1', 'v2')
Get('k1')
```
Incorrectly yields `v2`, that is to say that the Merge behaves like a Put.
## Example 2 with o`verwrite_key=true`
Currently, from an empty database, the following sequence:
```
Put('k1', 'v1')
Merge('k1', 'v2')
Get('k1')
```
Incorrectly yields `ERROR: kMergeInProgress`.
## Example 3 with `overwrite_key=false`
Currently, with a database containing `('k1' -> 'v1')`, the following sequence:
```
Merge('k1', 'v2')
GetFromBatchAndDB('k1')
```
Incorrectly yields `v1,v2`
## Example 4 with `overwrite_key=true`
Currently, with a database containing `('k1' -> 'v1')`, the following sequence:
```
Merge('k1', 'v1')
GetFromBatchAndDB('k1')
```
Incorrectly yields `ERROR: kMergeInProgress`.
## Example 5 with `overwrite_key=false`
Currently, from an empty database, the following sequence:
```
Put('k1', 'v1')
Merge('k1', 'v2')
GetFromBatchAndDB('k1')
```
Incorrectly yields `v1,v2`
## Example 6 with `overwrite_key=true`
Currently, from an empty database, `('k1' -> 'v1')`, the following sequence:
```
Put('k1', 'v1')
Merge('k1', 'v2')
GetFromBatchAndDB('k1')
```
Incorrectly yields `ERROR: kMergeInProgress`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8135
Reviewed By: pdillinger
Differential Revision: D27657938
Pulled By: mrambacher
fbshipit-source-id: 0fbda6bbc66bedeba96a84786d90141d776297df
4 years ago
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("a0,1,2", value);
|
|
|
|
|
|
|
|
s = txn->Put("A", "a");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "A", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("a", value);
|
|
|
|
|
|
|
|
s = txn->Merge("A", "3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "A", &value);
|
Add Merge Operator support to WriteBatchWithIndex (#8135)
Summary:
The WBWI has two differing modes of operation dependent on the value
of the constructor parameter `overwrite_key`.
Currently, regardless of the parameter, neither mode performs as
expected when using Merge. This PR remedies this by correctly invoking
the appropriate Merge Operator before returning results from the WBWI.
Examples of issues that exist which are solved by this PR:
## Example 1 with `overwrite_key=false`
Currently, from an empty database, the following sequence:
```
Put('k1', 'v1')
Merge('k1', 'v2')
Get('k1')
```
Incorrectly yields `v2`, that is to say that the Merge behaves like a Put.
## Example 2 with o`verwrite_key=true`
Currently, from an empty database, the following sequence:
```
Put('k1', 'v1')
Merge('k1', 'v2')
Get('k1')
```
Incorrectly yields `ERROR: kMergeInProgress`.
## Example 3 with `overwrite_key=false`
Currently, with a database containing `('k1' -> 'v1')`, the following sequence:
```
Merge('k1', 'v2')
GetFromBatchAndDB('k1')
```
Incorrectly yields `v1,v2`
## Example 4 with `overwrite_key=true`
Currently, with a database containing `('k1' -> 'v1')`, the following sequence:
```
Merge('k1', 'v1')
GetFromBatchAndDB('k1')
```
Incorrectly yields `ERROR: kMergeInProgress`.
## Example 5 with `overwrite_key=false`
Currently, from an empty database, the following sequence:
```
Put('k1', 'v1')
Merge('k1', 'v2')
GetFromBatchAndDB('k1')
```
Incorrectly yields `v1,v2`
## Example 6 with `overwrite_key=true`
Currently, from an empty database, `('k1' -> 'v1')`, the following sequence:
```
Put('k1', 'v1')
Merge('k1', 'v2')
GetFromBatchAndDB('k1')
```
Incorrectly yields `ERROR: kMergeInProgress`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8135
Reviewed By: pdillinger
Differential Revision: D27657938
Pulled By: mrambacher
fbshipit-source-id: 0fbda6bbc66bedeba96a84786d90141d776297df
4 years ago
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("a,3", value);
|
|
|
|
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
txn_options.lock_timeout = 1; // 1 ms
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn2);
|
|
|
|
|
|
|
|
// verify that txn has "A" locked
|
|
|
|
s = txn2->Merge("A", "4");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn;
|
|
|
|
|
|
|
|
s = db->Get(read_options, "A", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("a,3", value);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, DeleteRangeSupportTest) {
|
|
|
|
// The `DeleteRange()` API is banned everywhere.
|
|
|
|
ASSERT_TRUE(
|
|
|
|
db->DeleteRange(WriteOptions(), db->DefaultColumnFamily(), "a", "b")
|
|
|
|
.IsNotSupported());
|
|
|
|
|
|
|
|
// But range deletions can be added via the `Write()` API by specifying the
|
|
|
|
// proper flags to promise there are no conflicts according to the DB type
|
|
|
|
// (see `TransactionDB::DeleteRange()` API doc for details).
|
|
|
|
for (bool skip_concurrency_control : {false, true}) {
|
|
|
|
for (bool skip_duplicate_key_check : {false, true}) {
|
|
|
|
ASSERT_OK(db->Put(WriteOptions(), "a", "val"));
|
|
|
|
WriteBatch wb;
|
|
|
|
ASSERT_OK(wb.DeleteRange("a", "b"));
|
|
|
|
TransactionDBWriteOptimizations flags;
|
|
|
|
flags.skip_concurrency_control = skip_concurrency_control;
|
|
|
|
flags.skip_duplicate_key_check = skip_duplicate_key_check;
|
|
|
|
Status s = db->Write(WriteOptions(), flags, &wb);
|
|
|
|
std::string value;
|
|
|
|
switch (txn_db_options.write_policy) {
|
|
|
|
case WRITE_COMMITTED:
|
|
|
|
if (skip_concurrency_control) {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(db->Get(ReadOptions(), "a", &value).IsNotFound());
|
|
|
|
} else {
|
|
|
|
ASSERT_NOK(s);
|
|
|
|
ASSERT_OK(db->Get(ReadOptions(), "a", &value));
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case WRITE_PREPARED:
|
|
|
|
// Intentional fall-through
|
|
|
|
case WRITE_UNPREPARED:
|
|
|
|
if (skip_concurrency_control && skip_duplicate_key_check) {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(db->Get(ReadOptions(), "a", &value).IsNotFound());
|
|
|
|
} else {
|
|
|
|
ASSERT_NOK(s);
|
|
|
|
ASSERT_OK(db->Get(ReadOptions(), "a", &value));
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
// Without any promises from the user, range deletion via other `Write()`
|
|
|
|
// APIs are still banned.
|
|
|
|
ASSERT_OK(db->Put(WriteOptions(), "a", "val"));
|
|
|
|
ASSERT_NOK(db->Write(WriteOptions(), &wb));
|
|
|
|
ASSERT_OK(db->Get(ReadOptions(), "a", &value));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, DeferSnapshotTest) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
s = db->Put(write_options, "A", "a0");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options);
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options);
|
|
|
|
|
|
|
|
txn1->SetSnapshotOnNextOperation();
|
|
|
|
auto snapshot = txn1->GetSnapshot();
|
|
|
|
ASSERT_FALSE(snapshot);
|
|
|
|
|
|
|
|
s = txn2->Put("A", "a2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
s = txn1->GetForUpdate(read_options, "A", &value);
|
|
|
|
// Should not conflict with txn2 since snapshot wasn't set until
|
|
|
|
// GetForUpdate was called.
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("a2", value);
|
|
|
|
|
|
|
|
s = txn1->Put("A", "a1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Put(write_options, "B", "b0");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// Cannot lock B since it was written after the snapshot was set
|
|
|
|
s = txn1->Put("B", "b1");
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn1;
|
|
|
|
|
|
|
|
s = db->Get(read_options, "A", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("a1", value);
|
|
|
|
|
|
|
|
s = db->Get(read_options, "B", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("b0", value);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, DeferSnapshotTest2) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, snapshot_read_options;
|
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options);
|
|
|
|
|
|
|
|
txn1->SetSnapshot();
|
|
|
|
|
|
|
|
s = txn1->Put("A", "a1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = db->Put(write_options, "C", "c0");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = db->Put(write_options, "D", "d0");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
snapshot_read_options.snapshot = txn1->GetSnapshot();
|
|
|
|
|
|
|
|
txn1->SetSnapshotOnNextOperation();
|
|
|
|
|
|
|
|
s = txn1->Get(snapshot_read_options, "C", &value);
|
|
|
|
// Snapshot was set before C was written
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
s = txn1->Get(snapshot_read_options, "D", &value);
|
|
|
|
// Snapshot was set before D was written
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
// Snapshot should not have changed yet.
|
|
|
|
snapshot_read_options.snapshot = txn1->GetSnapshot();
|
|
|
|
|
|
|
|
s = txn1->Get(snapshot_read_options, "C", &value);
|
|
|
|
// Snapshot was set before C was written
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
s = txn1->Get(snapshot_read_options, "D", &value);
|
|
|
|
// Snapshot was set before D was written
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = txn1->GetForUpdate(read_options, "C", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("c0", value);
|
|
|
|
|
|
|
|
s = db->Put(write_options, "D", "d00");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// Snapshot is now set
|
|
|
|
snapshot_read_options.snapshot = txn1->GetSnapshot();
|
|
|
|
s = txn1->Get(snapshot_read_options, "D", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("d0", value);
|
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn1;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, DeferSnapshotSavePointTest) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, snapshot_read_options;
|
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options);
|
|
|
|
|
|
|
|
txn1->SetSavePoint(); // 1
|
|
|
|
|
|
|
|
s = db->Put(write_options, "T", "1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn1->SetSnapshotOnNextOperation();
|
|
|
|
|
|
|
|
s = db->Put(write_options, "T", "2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn1->SetSavePoint(); // 2
|
|
|
|
|
|
|
|
s = db->Put(write_options, "T", "3");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->Put("A", "a");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn1->SetSavePoint(); // 3
|
|
|
|
|
|
|
|
s = db->Put(write_options, "T", "4");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
txn1->SetSnapshot();
|
|
|
|
txn1->SetSnapshotOnNextOperation();
|
|
|
|
|
|
|
|
txn1->SetSavePoint(); // 4
|
|
|
|
|
|
|
|
s = db->Put(write_options, "T", "5");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
snapshot_read_options.snapshot = txn1->GetSnapshot();
|
|
|
|
s = txn1->Get(snapshot_read_options, "T", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("4", value);
|
|
|
|
|
|
|
|
s = txn1->Put("A", "a1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
snapshot_read_options.snapshot = txn1->GetSnapshot();
|
|
|
|
s = txn1->Get(snapshot_read_options, "T", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("5", value);
|
|
|
|
|
|
|
|
s = txn1->RollbackToSavePoint(); // Rollback to 4
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
snapshot_read_options.snapshot = txn1->GetSnapshot();
|
|
|
|
s = txn1->Get(snapshot_read_options, "T", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("4", value);
|
|
|
|
|
|
|
|
s = txn1->RollbackToSavePoint(); // Rollback to 3
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
snapshot_read_options.snapshot = txn1->GetSnapshot();
|
|
|
|
s = txn1->Get(snapshot_read_options, "T", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("3", value);
|
|
|
|
|
|
|
|
s = txn1->Get(read_options, "T", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("5", value);
|
|
|
|
|
|
|
|
s = txn1->RollbackToSavePoint(); // Rollback to 2
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
snapshot_read_options.snapshot = txn1->GetSnapshot();
|
|
|
|
ASSERT_FALSE(snapshot_read_options.snapshot);
|
|
|
|
s = txn1->Get(snapshot_read_options, "T", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("5", value);
|
|
|
|
|
|
|
|
s = txn1->Delete("A");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
snapshot_read_options.snapshot = txn1->GetSnapshot();
|
|
|
|
ASSERT_TRUE(snapshot_read_options.snapshot);
|
|
|
|
s = txn1->Get(snapshot_read_options, "T", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("5", value);
|
|
|
|
|
|
|
|
s = txn1->RollbackToSavePoint(); // Rollback to 1
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
s = txn1->Delete("A");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
snapshot_read_options.snapshot = txn1->GetSnapshot();
|
|
|
|
ASSERT_FALSE(snapshot_read_options.snapshot);
|
|
|
|
s = txn1->Get(snapshot_read_options, "T", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("5", value);
|
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, SetSnapshotOnNextOperationWithNotification) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
std::string value;
|
|
|
|
|
|
|
|
class Notifier : public TransactionNotifier {
|
|
|
|
private:
|
|
|
|
const Snapshot** snapshot_ptr_;
|
|
|
|
|
|
|
|
public:
|
|
|
|
explicit Notifier(const Snapshot** snapshot_ptr)
|
|
|
|
: snapshot_ptr_(snapshot_ptr) {}
|
|
|
|
|
|
|
|
void SnapshotCreated(const Snapshot* newSnapshot) override {
|
|
|
|
*snapshot_ptr_ = newSnapshot;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
std::shared_ptr<Notifier> notifier =
|
|
|
|
std::make_shared<Notifier>(&read_options.snapshot);
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
s = db->Put(write_options, "B", "0");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options);
|
|
|
|
|
|
|
|
txn1->SetSnapshotOnNextOperation(notifier);
|
|
|
|
ASSERT_FALSE(read_options.snapshot);
|
|
|
|
|
|
|
|
s = db->Put(write_options, "B", "1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// A Get does not generate the snapshot
|
|
|
|
s = txn1->Get(read_options, "B", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_FALSE(read_options.snapshot);
|
|
|
|
ASSERT_EQ(value, "1");
|
|
|
|
|
|
|
|
// Any other operation does
|
|
|
|
s = txn1->Put("A", "0");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// Now change "B".
|
|
|
|
s = db->Put(write_options, "B", "2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// The original value should still be read
|
|
|
|
s = txn1->Get(read_options, "B", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(read_options.snapshot);
|
|
|
|
ASSERT_EQ(value, "1");
|
|
|
|
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, ClearSnapshotTest) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, snapshot_read_options;
|
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
s = db->Put(write_options, "foo", "0");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options);
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
|
|
|
|
s = db->Put(write_options, "foo", "1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
snapshot_read_options.snapshot = txn->GetSnapshot();
|
|
|
|
ASSERT_FALSE(snapshot_read_options.snapshot);
|
|
|
|
|
|
|
|
// No snapshot created yet
|
|
|
|
s = txn->Get(snapshot_read_options, "foo", &value);
|
|
|
|
ASSERT_EQ(value, "1");
|
|
|
|
|
|
|
|
txn->SetSnapshot();
|
|
|
|
snapshot_read_options.snapshot = txn->GetSnapshot();
|
|
|
|
ASSERT_TRUE(snapshot_read_options.snapshot);
|
|
|
|
|
|
|
|
s = db->Put(write_options, "foo", "2");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// Snapshot was created before change to '2'
|
|
|
|
s = txn->Get(snapshot_read_options, "foo", &value);
|
|
|
|
ASSERT_EQ(value, "1");
|
|
|
|
|
|
|
|
txn->ClearSnapshot();
|
|
|
|
snapshot_read_options.snapshot = txn->GetSnapshot();
|
|
|
|
ASSERT_FALSE(snapshot_read_options.snapshot);
|
|
|
|
|
|
|
|
// Snapshot has now been cleared
|
|
|
|
s = txn->Get(snapshot_read_options, "foo", &value);
|
|
|
|
ASSERT_EQ(value, "2");
|
|
|
|
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, ToggleAutoCompactionTest) {
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
ColumnFamilyHandle *cfa, *cfb;
|
|
|
|
ColumnFamilyOptions cf_options;
|
|
|
|
|
|
|
|
// Create 2 new column families
|
|
|
|
s = db->CreateColumnFamily(cf_options, "CFA", &cfa);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = db->CreateColumnFamily(cf_options, "CFB", &cfb);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete cfa;
|
|
|
|
delete cfb;
|
|
|
|
delete db;
|
|
|
|
|
|
|
|
// open DB with three column families
|
|
|
|
std::vector<ColumnFamilyDescriptor> column_families;
|
|
|
|
// have to open default column family
|
|
|
|
column_families.push_back(
|
|
|
|
ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
|
|
|
|
// open the new column families
|
|
|
|
column_families.push_back(
|
|
|
|
ColumnFamilyDescriptor("CFA", ColumnFamilyOptions()));
|
|
|
|
column_families.push_back(
|
|
|
|
ColumnFamilyDescriptor("CFB", ColumnFamilyOptions()));
|
|
|
|
|
|
|
|
ColumnFamilyOptions* cf_opt_default = &column_families[0].options;
|
|
|
|
ColumnFamilyOptions* cf_opt_cfa = &column_families[1].options;
|
|
|
|
ColumnFamilyOptions* cf_opt_cfb = &column_families[2].options;
|
|
|
|
cf_opt_default->disable_auto_compactions = false;
|
|
|
|
cf_opt_cfa->disable_auto_compactions = true;
|
|
|
|
cf_opt_cfb->disable_auto_compactions = false;
|
|
|
|
|
|
|
|
std::vector<ColumnFamilyHandle*> handles;
|
|
|
|
|
|
|
|
s = TransactionDB::Open(options, txn_db_options, dbname, column_families,
|
|
|
|
&handles, &db);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
auto cfh_default = static_cast_with_check<ColumnFamilyHandleImpl>(handles[0]);
|
|
|
|
auto opt_default = *cfh_default->cfd()->GetLatestMutableCFOptions();
|
|
|
|
|
|
|
|
auto cfh_a = static_cast_with_check<ColumnFamilyHandleImpl>(handles[1]);
|
|
|
|
auto opt_a = *cfh_a->cfd()->GetLatestMutableCFOptions();
|
|
|
|
|
|
|
|
auto cfh_b = static_cast_with_check<ColumnFamilyHandleImpl>(handles[2]);
|
|
|
|
auto opt_b = *cfh_b->cfd()->GetLatestMutableCFOptions();
|
|
|
|
|
|
|
|
ASSERT_EQ(opt_default.disable_auto_compactions, false);
|
|
|
|
ASSERT_EQ(opt_a.disable_auto_compactions, true);
|
|
|
|
ASSERT_EQ(opt_b.disable_auto_compactions, false);
|
|
|
|
|
|
|
|
for (auto handle : handles) {
|
|
|
|
delete handle;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionStressTest, ExpiredTransactionDataRace1) {
|
|
|
|
// In this test, txn1 should succeed committing,
|
|
|
|
// as the callback is called after txn1 starts committing.
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"TransactionTest::ExpirableTransactionDataRace:1"}});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"TransactionTest::ExpirableTransactionDataRace:1", [&](void* /*arg*/) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
|
|
|
|
// Force txn1 to expire
|
|
|
|
/* sleep override */
|
|
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(1500));
|
|
|
|
|
|
|
|
Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
Status s;
|
|
|
|
s = txn2->Put("X", "2");
|
|
|
|
ASSERT_TRUE(s.IsTimedOut());
|
|
|
|
s = txn2->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete txn2;
|
|
|
|
});
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
WriteOptions write_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
|
|
|
|
txn_options.expiration = 1000; // 1 second
|
|
|
|
Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
|
|
|
|
Status s;
|
|
|
|
s = txn1->Put("X", "1");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
ReadOptions read_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
3 years ago
|
|
|
std::string value;
|
|
|
|
s = db->Get(read_options, "X", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("1", value);
|
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
|
|
|
|
#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
|
|
|
namespace {
|
|
|
|
// cmt_delay_ms is the delay between prepare and commit
|
|
|
|
// first_id is the id of the first transaction
|
|
|
|
Status TransactionStressTestInserter(
|
|
|
|
TransactionDB* db, const size_t num_transactions, const size_t num_sets,
|
|
|
|
const size_t num_keys_per_set, Random64* rand,
|
|
|
|
const uint64_t cmt_delay_ms = 0, const uint64_t first_id = 0) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
txn_options.use_only_the_last_commit_time_batch_for_recovery = true;
|
|
|
|
|
|
|
|
// Inside the inserter we might also retake the snapshot. We do both since two
|
|
|
|
// separte functions are engaged for each.
|
|
|
|
txn_options.set_snapshot = rand->OneIn(2);
|
|
|
|
|
|
|
|
RandomTransactionInserter inserter(
|
|
|
|
rand, write_options, read_options, num_keys_per_set,
|
|
|
|
static_cast<uint16_t>(num_sets), cmt_delay_ms, first_id);
|
|
|
|
|
|
|
|
for (size_t t = 0; t < num_transactions; t++) {
|
|
|
|
bool success = inserter.TransactionDBInsert(db, txn_options);
|
|
|
|
if (!success) {
|
|
|
|
// unexpected failure
|
|
|
|
return inserter.GetLastStatus();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
inserter.GetLastStatus().PermitUncheckedError();
|
|
|
|
|
|
|
|
// Make sure at least some of the transactions succeeded. It's ok if
|
|
|
|
// some failed due to write-conflicts.
|
|
|
|
if (num_transactions != 1 &&
|
|
|
|
inserter.GetFailureCount() > num_transactions / 2) {
|
|
|
|
return Status::TryAgain("Too many transactions failed! " +
|
|
|
|
std::to_string(inserter.GetFailureCount()) + " / " +
|
|
|
|
std::to_string(num_transactions));
|
|
|
|
}
|
|
|
|
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
// Worker threads add a number to a key from each set of keys. The checker
|
|
|
|
// threads verify that the sum of all keys in each set are equal.
|
|
|
|
TEST_P(MySQLStyleTransactionTest, TransactionStressTest) {
|
|
|
|
// Small write buffer to trigger more compactions
|
|
|
|
options.write_buffer_size = 1024;
|
|
|
|
txn_db_options.rollback_deletion_type_callback =
|
|
|
|
[](TransactionDB*, ColumnFamilyHandle*, const Slice& key) {
|
|
|
|
return RandomTransactionInserter::RollbackDeletionTypeCallback(key);
|
|
|
|
};
|
|
|
|
ASSERT_OK(ReOpenNoDelete());
|
|
|
|
constexpr size_t num_workers = 4; // worker threads count
|
|
|
|
constexpr size_t num_checkers = 2; // checker threads count
|
|
|
|
constexpr size_t num_slow_checkers = 2; // checker threads emulating backups
|
|
|
|
constexpr size_t num_slow_workers = 1; // slow worker threads count
|
|
|
|
constexpr size_t num_transactions_per_thread = 1000;
|
|
|
|
constexpr uint16_t num_sets = 3;
|
|
|
|
constexpr size_t num_keys_per_set = 100;
|
|
|
|
// Setting the key-space to be 100 keys should cause enough write-conflicts
|
|
|
|
// to make this test interesting.
|
|
|
|
|
|
|
|
std::vector<port::Thread> threads;
|
|
|
|
std::atomic<uint32_t> finished = {0};
|
|
|
|
constexpr bool TAKE_SNAPSHOT = true;
|
|
|
|
uint64_t time_seed = env->NowMicros();
|
|
|
|
printf("time_seed is %" PRIu64 "\n", time_seed); // would help to reproduce
|
|
|
|
|
|
|
|
std::function<void()> call_inserter = [&] {
|
|
|
|
size_t thd_seed = std::hash<std::thread::id>()(std::this_thread::get_id());
|
|
|
|
Random64 rand(time_seed * thd_seed);
|
|
|
|
ASSERT_OK(TransactionStressTestInserter(db, num_transactions_per_thread,
|
|
|
|
num_sets, num_keys_per_set, &rand));
|
|
|
|
finished++;
|
|
|
|
};
|
|
|
|
std::function<void()> call_checker = [&] {
|
|
|
|
size_t thd_seed = std::hash<std::thread::id>()(std::this_thread::get_id());
|
|
|
|
Random64 rand(time_seed * thd_seed);
|
|
|
|
// Verify that data is consistent
|
|
|
|
while (finished < num_workers) {
|
|
|
|
ASSERT_OK(RandomTransactionInserter::Verify(
|
|
|
|
db, num_sets, num_keys_per_set, TAKE_SNAPSHOT, &rand));
|
|
|
|
}
|
|
|
|
};
|
|
|
|
std::function<void()> call_slow_checker = [&] {
|
|
|
|
size_t thd_seed = std::hash<std::thread::id>()(std::this_thread::get_id());
|
|
|
|
Random64 rand(time_seed * thd_seed);
|
|
|
|
// Verify that data is consistent
|
|
|
|
while (finished < num_workers) {
|
|
|
|
uint64_t delay_ms = rand.Uniform(100) + 1;
|
|
|
|
Status s = RandomTransactionInserter::Verify(
|
|
|
|
db, num_sets, num_keys_per_set, TAKE_SNAPSHOT, &rand, delay_ms);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
std::function<void()> call_slow_inserter = [&] {
|
|
|
|
size_t thd_seed = std::hash<std::thread::id>()(std::this_thread::get_id());
|
|
|
|
Random64 rand(time_seed * thd_seed);
|
|
|
|
uint64_t id = 0;
|
|
|
|
// Verify that data is consistent
|
|
|
|
while (finished < num_workers) {
|
|
|
|
uint64_t delay_ms = rand.Uniform(500) + 1;
|
|
|
|
ASSERT_OK(TransactionStressTestInserter(db, 1, num_sets, num_keys_per_set,
|
|
|
|
&rand, delay_ms, id++));
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
for (uint32_t i = 0; i < num_workers; i++) {
|
|
|
|
threads.emplace_back(call_inserter);
|
|
|
|
}
|
|
|
|
for (uint32_t i = 0; i < num_checkers; i++) {
|
|
|
|
threads.emplace_back(call_checker);
|
|
|
|
}
|
|
|
|
if (with_slow_threads_) {
|
|
|
|
for (uint32_t i = 0; i < num_slow_checkers; i++) {
|
|
|
|
threads.emplace_back(call_slow_checker);
|
|
|
|
}
|
|
|
|
for (uint32_t i = 0; i < num_slow_workers; i++) {
|
|
|
|
threads.emplace_back(call_slow_inserter);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Wait for all threads to finish
|
|
|
|
for (auto& t : threads) {
|
|
|
|
t.join();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Verify that data is consistent
|
|
|
|
Status s = RandomTransactionInserter::Verify(db, num_sets, num_keys_per_set,
|
|
|
|
!TAKE_SNAPSHOT);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, MemoryLimitTest) {
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
// Header (12 bytes) + NOOP (1 byte) + 2 * 8 bytes for data.
|
|
|
|
txn_options.max_write_batch_size = 29;
|
|
|
|
// Set threshold to unlimited so that the write batch does not get flushed,
|
|
|
|
// and can hit the memory limit.
|
|
|
|
txn_options.write_batch_flush_threshold = 0;
|
|
|
|
std::string value;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
Transaction* txn = db->BeginTransaction(WriteOptions(), txn_options);
|
|
|
|
ASSERT_TRUE(txn);
|
|
|
|
|
|
|
|
ASSERT_EQ(0, txn->GetNumPuts());
|
|
|
|
ASSERT_LE(0, txn->GetID());
|
|
|
|
|
|
|
|
s = txn->Put(Slice("a"), Slice("...."));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(1, txn->GetNumPuts());
|
|
|
|
|
|
|
|
s = txn->Put(Slice("b"), Slice("...."));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(2, txn->GetNumPuts());
|
|
|
|
|
|
|
|
s = txn->Put(Slice("b"), Slice("...."));
|
|
|
|
ASSERT_TRUE(s.IsMemoryLimit());
|
|
|
|
ASSERT_EQ(2, txn->GetNumPuts());
|
|
|
|
|
|
|
|
ASSERT_OK(txn->Rollback());
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
|
|
|
#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
|
|
|
// This test clarifies the existing expectation from the sequence number
|
|
|
|
// algorithm. It could detect mistakes in updating the code but it is not
|
|
|
|
// necessarily the one acceptable way. If the algorithm is legitimately changed,
|
|
|
|
// this unit test should be updated as well.
|
|
|
|
TEST_P(TransactionStressTest, SeqAdvanceTest) {
|
|
|
|
// TODO(myabandeh): must be test with false before new releases
|
|
|
|
const bool short_test = true;
|
|
|
|
WriteOptions wopts;
|
|
|
|
FlushOptions fopt;
|
|
|
|
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
ASSERT_OK(ReOpen());
|
|
|
|
|
|
|
|
// Do the test with NUM_BRANCHES branches in it. Each run of a test takes some
|
|
|
|
// of the branches. This is the same as counting a binary number where i-th
|
|
|
|
// bit represents whether we take branch i in the represented by the number.
|
|
|
|
const size_t NUM_BRANCHES = short_test ? 6 : 10;
|
|
|
|
// Helper function that shows if the branch is to be taken in the run
|
|
|
|
// represented by the number n.
|
|
|
|
auto branch_do = [&](size_t n, size_t* branch) {
|
|
|
|
assert(*branch < NUM_BRANCHES);
|
|
|
|
const size_t filter = static_cast<size_t>(1) << *branch;
|
|
|
|
return n & filter;
|
|
|
|
};
|
|
|
|
const size_t max_n = static_cast<size_t>(1) << NUM_BRANCHES;
|
|
|
|
for (size_t n = 0; n < max_n; n++) {
|
|
|
|
DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
|
|
|
|
size_t branch = 0;
|
|
|
|
auto seq = db_impl->GetLatestSequenceNumber();
|
|
|
|
exp_seq = seq;
|
|
|
|
TestTxn0(0);
|
|
|
|
seq = db_impl->TEST_GetLastVisibleSequence();
|
|
|
|
ASSERT_EQ(exp_seq, seq);
|
|
|
|
|
|
|
|
if (branch_do(n, &branch)) {
|
|
|
|
ASSERT_OK(db_impl->Flush(fopt));
|
|
|
|
seq = db_impl->TEST_GetLastVisibleSequence();
|
|
|
|
ASSERT_EQ(exp_seq, seq);
|
|
|
|
}
|
|
|
|
if (!short_test && branch_do(n, &branch)) {
|
|
|
|
ASSERT_OK(db_impl->FlushWAL(true));
|
|
|
|
ASSERT_OK(ReOpenNoDelete());
|
|
|
|
db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
|
|
|
|
seq = db_impl->GetLatestSequenceNumber();
|
|
|
|
ASSERT_EQ(exp_seq, seq);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Doing it twice might detect some bugs
|
|
|
|
TestTxn0(1);
|
|
|
|
seq = db_impl->TEST_GetLastVisibleSequence();
|
|
|
|
ASSERT_EQ(exp_seq, seq);
|
|
|
|
|
|
|
|
TestTxn1(0);
|
|
|
|
seq = db_impl->TEST_GetLastVisibleSequence();
|
|
|
|
ASSERT_EQ(exp_seq, seq);
|
|
|
|
|
|
|
|
if (branch_do(n, &branch)) {
|
|
|
|
ASSERT_OK(db_impl->Flush(fopt));
|
|
|
|
seq = db_impl->TEST_GetLastVisibleSequence();
|
|
|
|
ASSERT_EQ(exp_seq, seq);
|
|
|
|
}
|
|
|
|
if (!short_test && branch_do(n, &branch)) {
|
|
|
|
ASSERT_OK(db_impl->FlushWAL(true));
|
|
|
|
ASSERT_OK(ReOpenNoDelete());
|
|
|
|
db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
|
|
|
|
seq = db_impl->GetLatestSequenceNumber();
|
|
|
|
ASSERT_EQ(exp_seq, seq);
|
|
|
|
}
|
|
|
|
|
|
|
|
TestTxn3(0);
|
|
|
|
seq = db_impl->TEST_GetLastVisibleSequence();
|
|
|
|
ASSERT_EQ(exp_seq, seq);
|
|
|
|
|
|
|
|
if (branch_do(n, &branch)) {
|
|
|
|
ASSERT_OK(db_impl->Flush(fopt));
|
|
|
|
seq = db_impl->TEST_GetLastVisibleSequence();
|
|
|
|
ASSERT_EQ(exp_seq, seq);
|
|
|
|
}
|
|
|
|
if (!short_test && branch_do(n, &branch)) {
|
|
|
|
ASSERT_OK(db_impl->FlushWAL(true));
|
|
|
|
ASSERT_OK(ReOpenNoDelete());
|
|
|
|
db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
|
|
|
|
seq = db_impl->GetLatestSequenceNumber();
|
|
|
|
ASSERT_EQ(exp_seq, seq);
|
|
|
|
}
|
|
|
|
|
|
|
|
TestTxn4(0);
|
|
|
|
seq = db_impl->TEST_GetLastVisibleSequence();
|
|
|
|
|
|
|
|
ASSERT_EQ(exp_seq, seq);
|
|
|
|
|
|
|
|
if (branch_do(n, &branch)) {
|
|
|
|
ASSERT_OK(db_impl->Flush(fopt));
|
|
|
|
seq = db_impl->TEST_GetLastVisibleSequence();
|
|
|
|
ASSERT_EQ(exp_seq, seq);
|
|
|
|
}
|
|
|
|
if (!short_test && branch_do(n, &branch)) {
|
|
|
|
ASSERT_OK(db_impl->FlushWAL(true));
|
|
|
|
ASSERT_OK(ReOpenNoDelete());
|
|
|
|
db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
|
|
|
|
seq = db_impl->GetLatestSequenceNumber();
|
|
|
|
ASSERT_EQ(exp_seq, seq);
|
|
|
|
}
|
|
|
|
|
|
|
|
TestTxn2(0);
|
|
|
|
seq = db_impl->TEST_GetLastVisibleSequence();
|
|
|
|
ASSERT_EQ(exp_seq, seq);
|
|
|
|
|
|
|
|
if (branch_do(n, &branch)) {
|
|
|
|
ASSERT_OK(db_impl->Flush(fopt));
|
|
|
|
seq = db_impl->TEST_GetLastVisibleSequence();
|
|
|
|
ASSERT_EQ(exp_seq, seq);
|
|
|
|
}
|
|
|
|
if (!short_test && branch_do(n, &branch)) {
|
|
|
|
ASSERT_OK(db_impl->FlushWAL(true));
|
|
|
|
ASSERT_OK(ReOpenNoDelete());
|
|
|
|
db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
|
|
|
|
seq = db_impl->GetLatestSequenceNumber();
|
|
|
|
ASSERT_EQ(exp_seq, seq);
|
|
|
|
}
|
|
|
|
ASSERT_OK(ReOpen());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
|
|
|
|
|
|
|
// Verify that the optimization would not compromize the correctness
|
|
|
|
TEST_P(TransactionTest, Optimizations) {
|
|
|
|
size_t comb_cnt = size_t(1) << 2; // 2 is number of optimization vars
|
|
|
|
for (size_t new_comb = 0; new_comb < comb_cnt; new_comb++) {
|
|
|
|
TransactionDBWriteOptimizations optimizations;
|
|
|
|
optimizations.skip_concurrency_control = IsInCombination(0, new_comb);
|
|
|
|
optimizations.skip_duplicate_key_check = IsInCombination(1, new_comb);
|
|
|
|
|
|
|
|
ASSERT_OK(ReOpen());
|
|
|
|
WriteOptions write_options;
|
|
|
|
WriteBatch batch;
|
|
|
|
ASSERT_OK(batch.Put(Slice("k"), Slice("v1")));
|
|
|
|
ASSERT_OK(db->Write(write_options, &batch));
|
|
|
|
|
|
|
|
ReadOptions ropt;
|
|
|
|
PinnableSlice pinnable_val;
|
|
|
|
ASSERT_OK(db->Get(ropt, db->DefaultColumnFamily(), "k", &pinnable_val));
|
|
|
|
ASSERT_TRUE(pinnable_val == ("v1"));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// A comparator that uses only the first three bytes
|
|
|
|
class ThreeBytewiseComparator : public Comparator {
|
|
|
|
public:
|
|
|
|
ThreeBytewiseComparator() {}
|
|
|
|
const char* Name() const override { return "test.ThreeBytewiseComparator"; }
|
|
|
|
int Compare(const Slice& a, const Slice& b) const override {
|
|
|
|
Slice na = Slice(a.data(), a.size() < 3 ? a.size() : 3);
|
|
|
|
Slice nb = Slice(b.data(), b.size() < 3 ? b.size() : 3);
|
|
|
|
return na.compare(nb);
|
|
|
|
}
|
|
|
|
bool Equal(const Slice& a, const Slice& b) const override {
|
|
|
|
Slice na = Slice(a.data(), a.size() < 3 ? a.size() : 3);
|
|
|
|
Slice nb = Slice(b.data(), b.size() < 3 ? b.size() : 3);
|
|
|
|
return na == nb;
|
|
|
|
}
|
|
|
|
// These methods below don't seem relevant to this test. Implement them if
|
|
|
|
// proven othersize.
|
|
|
|
void FindShortestSeparator(std::string* start,
|
|
|
|
const Slice& limit) const override {
|
|
|
|
const Comparator* bytewise_comp = BytewiseComparator();
|
|
|
|
bytewise_comp->FindShortestSeparator(start, limit);
|
|
|
|
}
|
|
|
|
void FindShortSuccessor(std::string* key) const override {
|
|
|
|
const Comparator* bytewise_comp = BytewiseComparator();
|
|
|
|
bytewise_comp->FindShortSuccessor(key);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
|
|
|
TEST_P(TransactionTest, GetWithoutSnapshot) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
std::atomic<bool> finish = {false};
|
|
|
|
ASSERT_OK(db->Put(write_options, "key", "value"));
|
|
|
|
ROCKSDB_NAMESPACE::port::Thread commit_thread([&]() {
|
|
|
|
for (int i = 0; i < 100; i++) {
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_OK(txn->SetName("xid"));
|
|
|
|
ASSERT_OK(txn->Put("key", "overridedvalue"));
|
|
|
|
ASSERT_OK(txn->Put("key", "value"));
|
|
|
|
ASSERT_OK(txn->Prepare());
|
|
|
|
ASSERT_OK(txn->Commit());
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
finish = true;
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::port::Thread read_thread([&]() {
|
|
|
|
while (!finish) {
|
|
|
|
ReadOptions ropt;
|
|
|
|
PinnableSlice pinnable_val;
|
|
|
|
ASSERT_OK(db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val));
|
|
|
|
ASSERT_TRUE(pinnable_val == ("value"));
|
|
|
|
}
|
|
|
|
});
|
|
|
|
commit_thread.join();
|
|
|
|
read_thread.join();
|
|
|
|
}
|
|
|
|
#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
|
|
|
|
|
|
|
// Test that the transactional db can handle duplicate keys in the write batch
|
|
|
|
TEST_P(TransactionTest, DuplicateKeys) {
|
|
|
|
ColumnFamilyOptions cf_options;
|
|
|
|
std::string cf_name = "two";
|
|
|
|
ColumnFamilyHandle* cf_handle = nullptr;
|
|
|
|
{
|
|
|
|
ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
|
|
|
|
WriteOptions write_options;
|
|
|
|
WriteBatch batch;
|
|
|
|
ASSERT_OK(batch.Put(Slice("key"), Slice("value")));
|
|
|
|
ASSERT_OK(batch.Put(Slice("key2"), Slice("value2")));
|
|
|
|
// duplicate the keys
|
|
|
|
ASSERT_OK(batch.Put(Slice("key"), Slice("value3")));
|
|
|
|
// duplicate the 2nd key. It should not be counted duplicate since a
|
|
|
|
// sub-patch is cut after the last duplicate.
|
|
|
|
ASSERT_OK(batch.Put(Slice("key2"), Slice("value4")));
|
|
|
|
// duplicate the keys but in a different cf. It should not be counted as
|
|
|
|
// duplicate keys
|
|
|
|
ASSERT_OK(batch.Put(cf_handle, Slice("key"), Slice("value5")));
|
|
|
|
|
|
|
|
ASSERT_OK(db->Write(write_options, &batch));
|
|
|
|
|
|
|
|
ReadOptions ropt;
|
|
|
|
PinnableSlice pinnable_val;
|
|
|
|
auto s = db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(pinnable_val == ("value3"));
|
|
|
|
s = db->Get(ropt, db->DefaultColumnFamily(), "key2", &pinnable_val);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(pinnable_val == ("value4"));
|
|
|
|
s = db->Get(ropt, cf_handle, "key", &pinnable_val);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(pinnable_val == ("value5"));
|
|
|
|
|
|
|
|
delete cf_handle;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Test with non-bytewise comparator
|
|
|
|
{
|
|
|
|
ASSERT_OK(ReOpen());
|
|
|
|
std::unique_ptr<const Comparator> comp_gc(new ThreeBytewiseComparator());
|
|
|
|
cf_options.comparator = comp_gc.get();
|
|
|
|
ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
|
|
|
|
WriteOptions write_options;
|
|
|
|
WriteBatch batch;
|
|
|
|
ASSERT_OK(batch.Put(cf_handle, Slice("key"), Slice("value")));
|
|
|
|
// The first three bytes are the same, do it must be counted as duplicate
|
|
|
|
ASSERT_OK(batch.Put(cf_handle, Slice("key2"), Slice("value2")));
|
|
|
|
// check for 2nd duplicate key in cf with non-default comparator
|
|
|
|
ASSERT_OK(batch.Put(cf_handle, Slice("key2b"), Slice("value2b")));
|
|
|
|
ASSERT_OK(db->Write(write_options, &batch));
|
|
|
|
|
|
|
|
// The value must be the most recent value for all the keys equal to "key",
|
|
|
|
// including "key2"
|
|
|
|
ReadOptions ropt;
|
|
|
|
PinnableSlice pinnable_val;
|
|
|
|
ASSERT_OK(db->Get(ropt, cf_handle, "key", &pinnable_val));
|
|
|
|
ASSERT_TRUE(pinnable_val == ("value2b"));
|
|
|
|
|
|
|
|
// Test duplicate keys with rollback
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_OK(txn0->SetName("xid"));
|
|
|
|
ASSERT_OK(txn0->Put(cf_handle, Slice("key3"), Slice("value3")));
|
|
|
|
ASSERT_OK(txn0->Merge(cf_handle, Slice("key4"), Slice("value4")));
|
|
|
|
ASSERT_OK(txn0->Rollback());
|
|
|
|
ASSERT_OK(db->Get(ropt, cf_handle, "key5", &pinnable_val));
|
|
|
|
ASSERT_TRUE(pinnable_val == ("value2b"));
|
|
|
|
delete txn0;
|
|
|
|
|
|
|
|
delete cf_handle;
|
|
|
|
cf_options.comparator = BytewiseComparator();
|
|
|
|
}
|
|
|
|
|
|
|
|
for (bool do_prepare : {true, false}) {
|
|
|
|
for (bool do_rollback : {true, false}) {
|
|
|
|
for (bool with_commit_batch : {true, false}) {
|
|
|
|
if (with_commit_batch && !do_prepare) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (with_commit_batch && do_rollback) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
ASSERT_OK(ReOpen());
|
|
|
|
ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
txn_options.use_only_the_last_commit_time_batch_for_recovery = true;
|
|
|
|
WriteOptions write_options;
|
|
|
|
Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
auto s = txn0->SetName("xid");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn0->Put(Slice("foo0"), Slice("bar0a"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn0->Put(Slice("foo0"), Slice("bar0b"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn0->Put(Slice("foo1"), Slice("bar1"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn0->Merge(Slice("foo2"), Slice("bar2a"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
// Repeat a key after the start of a sub-patch. This should not cause a
|
|
|
|
// duplicate in the most recent sub-patch and hence not creating a new
|
|
|
|
// sub-patch.
|
|
|
|
s = txn0->Put(Slice("foo0"), Slice("bar0c"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn0->Merge(Slice("foo2"), Slice("bar2b"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
// duplicate the keys but in a different cf. It should not be counted as
|
|
|
|
// duplicate.
|
|
|
|
s = txn0->Put(cf_handle, Slice("foo0"), Slice("bar0-cf1"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn0->Put(Slice("foo3"), Slice("bar3"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn0->Merge(Slice("foo3"), Slice("bar3"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn0->Put(Slice("foo4"), Slice("bar4"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn0->Delete(Slice("foo4"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = txn0->SingleDelete(Slice("foo4"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
if (do_prepare) {
|
|
|
|
s = txn0->Prepare();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
if (do_rollback) {
|
|
|
|
// Test rolling back the batch with duplicates
|
|
|
|
s = txn0->Rollback();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
} else {
|
|
|
|
if (with_commit_batch) {
|
|
|
|
assert(do_prepare);
|
|
|
|
auto cb = txn0->GetCommitTimeWriteBatch();
|
|
|
|
// duplicate a key in the original batch
|
|
|
|
// TODO(myabandeh): the behavior of GetCommitTimeWriteBatch
|
|
|
|
// conflicting with the prepared batch is currently undefined and
|
|
|
|
// gives different results in different implementations.
|
|
|
|
|
|
|
|
// s = cb->Put(Slice("foo0"), Slice("bar0d"));
|
|
|
|
// ASSERT_OK(s);
|
|
|
|
// add a new duplicate key
|
|
|
|
s = cb->Put(Slice("foo6"), Slice("bar6a"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = cb->Put(Slice("foo6"), Slice("bar6b"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
// add a duplicate key that is removed in the same batch
|
|
|
|
s = cb->Put(Slice("foo7"), Slice("bar7a"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = cb->Delete(Slice("foo7"));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
s = txn0->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
delete txn0;
|
|
|
|
ReadOptions ropt;
|
|
|
|
PinnableSlice pinnable_val;
|
|
|
|
|
|
|
|
if (do_rollback) {
|
|
|
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
s = db->Get(ropt, cf_handle, "foo0", &pinnable_val);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo2", &pinnable_val);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo3", &pinnable_val);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo4", &pinnable_val);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
} else {
|
|
|
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(pinnable_val == ("bar0c"));
|
|
|
|
s = db->Get(ropt, cf_handle, "foo0", &pinnable_val);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(pinnable_val == ("bar0-cf1"));
|
|
|
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(pinnable_val == ("bar1"));
|
|
|
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo2", &pinnable_val);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(pinnable_val == ("bar2a,bar2b"));
|
|
|
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo3", &pinnable_val);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(pinnable_val == ("bar3,bar3"));
|
|
|
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo4", &pinnable_val);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
if (with_commit_batch) {
|
|
|
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo6", &pinnable_val);
|
|
|
|
if (txn_db_options.write_policy ==
|
|
|
|
TxnDBWritePolicy::WRITE_COMMITTED) {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(pinnable_val == ("bar6b"));
|
|
|
|
} else {
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
}
|
|
|
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo7", &pinnable_val);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
delete cf_handle;
|
|
|
|
} // with_commit_batch
|
|
|
|
} // do_rollback
|
|
|
|
} // do_prepare
|
|
|
|
|
|
|
|
if (!options.unordered_write) {
|
|
|
|
// Also test with max_successive_merges > 0. max_successive_merges will not
|
|
|
|
// affect our algorithm for duplicate key insertion but we add the test to
|
|
|
|
// verify that.
|
|
|
|
cf_options.max_successive_merges = 2;
|
|
|
|
cf_options.merge_operator = MergeOperators::CreateStringAppendOperator();
|
|
|
|
ASSERT_OK(ReOpen());
|
|
|
|
db->CreateColumnFamily(cf_options, cf_name, &cf_handle);
|
|
|
|
WriteOptions write_options;
|
|
|
|
// Ensure one value for the key
|
|
|
|
ASSERT_OK(db->Put(write_options, cf_handle, Slice("key"), Slice("value")));
|
|
|
|
WriteBatch batch;
|
|
|
|
// Merge more than max_successive_merges times
|
|
|
|
ASSERT_OK(batch.Merge(cf_handle, Slice("key"), Slice("1")));
|
|
|
|
ASSERT_OK(batch.Merge(cf_handle, Slice("key"), Slice("2")));
|
|
|
|
ASSERT_OK(batch.Merge(cf_handle, Slice("key"), Slice("3")));
|
|
|
|
ASSERT_OK(batch.Merge(cf_handle, Slice("key"), Slice("4")));
|
|
|
|
ASSERT_OK(db->Write(write_options, &batch));
|
|
|
|
ReadOptions read_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
3 years ago
|
|
|
std::string value;
|
|
|
|
ASSERT_OK(db->Get(read_options, cf_handle, "key", &value));
|
|
|
|
ASSERT_EQ(value, "value,1,2,3,4");
|
|
|
|
delete cf_handle;
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
// Test that the duplicate detection is not compromised after rolling back
|
|
|
|
// to a save point
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
WriteOptions write_options;
|
|
|
|
Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0a")));
|
|
|
|
ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0b")));
|
|
|
|
txn0->SetSavePoint();
|
|
|
|
ASSERT_OK(txn0->RollbackToSavePoint());
|
|
|
|
ASSERT_OK(txn0->Commit());
|
|
|
|
delete txn0;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Test sucessfull recovery after a crash
|
|
|
|
{
|
|
|
|
ASSERT_OK(ReOpen());
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions ropt;
|
|
|
|
Transaction* txn0;
|
|
|
|
PinnableSlice pinnable_val;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
std::unique_ptr<const Comparator> comp_gc(new ThreeBytewiseComparator());
|
|
|
|
cf_options.comparator = comp_gc.get();
|
|
|
|
cf_options.merge_operator = MergeOperators::CreateStringAppendOperator();
|
|
|
|
ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
|
|
|
|
delete cf_handle;
|
|
|
|
std::vector<ColumnFamilyDescriptor> cfds{
|
|
|
|
ColumnFamilyDescriptor(kDefaultColumnFamilyName,
|
|
|
|
ColumnFamilyOptions(options)),
|
|
|
|
ColumnFamilyDescriptor(cf_name, cf_options),
|
|
|
|
};
|
|
|
|
std::vector<ColumnFamilyHandle*> handles;
|
|
|
|
ASSERT_OK(ReOpenNoDelete(cfds, &handles));
|
|
|
|
|
|
|
|
assert(db != nullptr);
|
|
|
|
ASSERT_OK(db->Put(write_options, "foo0", "init"));
|
|
|
|
ASSERT_OK(db->Put(write_options, "foo1", "init"));
|
|
|
|
ASSERT_OK(db->Put(write_options, handles[1], "foo0", "init"));
|
|
|
|
ASSERT_OK(db->Put(write_options, handles[1], "foo1", "init"));
|
|
|
|
|
|
|
|
// one entry
|
|
|
|
txn0 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_OK(txn0->SetName("xid"));
|
|
|
|
ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0a")));
|
|
|
|
ASSERT_OK(txn0->Prepare());
|
|
|
|
delete txn0;
|
|
|
|
// This will check the asserts inside recovery code
|
|
|
|
ASSERT_OK(db->FlushWAL(true));
|
|
|
|
reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
|
|
|
|
ASSERT_OK(ReOpenNoDelete(cfds, &handles));
|
|
|
|
txn0 = db->GetTransactionByName("xid");
|
|
|
|
ASSERT_TRUE(txn0 != nullptr);
|
|
|
|
ASSERT_OK(txn0->Commit());
|
|
|
|
delete txn0;
|
|
|
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(pinnable_val == ("bar0a"));
|
|
|
|
|
|
|
|
// two entries, no duplicate
|
|
|
|
txn0 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_OK(txn0->SetName("xid"));
|
|
|
|
ASSERT_OK(txn0->Put(handles[1], Slice("foo0"), Slice("bar0b")));
|
|
|
|
ASSERT_OK(txn0->Put(handles[1], Slice("fol1"), Slice("bar1b")));
|
|
|
|
ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0b")));
|
|
|
|
ASSERT_OK(txn0->Put(Slice("foo1"), Slice("bar1b")));
|
|
|
|
ASSERT_OK(txn0->Prepare());
|
|
|
|
delete txn0;
|
|
|
|
// This will check the asserts inside recovery code
|
|
|
|
ASSERT_OK(db->FlushWAL(true));
|
|
|
|
// Flush only cf 1
|
|
|
|
ASSERT_OK(static_cast_with_check<DBImpl>(db->GetRootDB())
|
|
|
|
->TEST_FlushMemTable(true, false, handles[1]));
|
|
|
|
reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
|
|
|
|
ASSERT_OK(ReOpenNoDelete(cfds, &handles));
|
|
|
|
txn0 = db->GetTransactionByName("xid");
|
|
|
|
ASSERT_TRUE(txn0 != nullptr);
|
|
|
|
ASSERT_OK(txn0->Commit());
|
|
|
|
delete txn0;
|
|
|
|
pinnable_val.Reset();
|
|
|
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(pinnable_val == ("bar0b"));
|
|
|
|
pinnable_val.Reset();
|
|
|
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(pinnable_val == ("bar1b"));
|
|
|
|
pinnable_val.Reset();
|
|
|
|
s = db->Get(ropt, handles[1], "foo0", &pinnable_val);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(pinnable_val == ("bar0b"));
|
|
|
|
pinnable_val.Reset();
|
|
|
|
s = db->Get(ropt, handles[1], "fol1", &pinnable_val);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(pinnable_val == ("bar1b"));
|
|
|
|
|
|
|
|
// one duplicate with ::Put
|
|
|
|
txn0 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_OK(txn0->SetName("xid"));
|
|
|
|
ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar0c")));
|
|
|
|
ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey1"), Slice("bar1d")));
|
|
|
|
ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0c")));
|
|
|
|
ASSERT_OK(txn0->Put(Slice("foo1"), Slice("bar1c")));
|
|
|
|
ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0d")));
|
|
|
|
ASSERT_OK(txn0->Prepare());
|
|
|
|
delete txn0;
|
|
|
|
// This will check the asserts inside recovery code
|
|
|
|
ASSERT_OK(db->FlushWAL(true));
|
|
|
|
// Flush only cf 1
|
|
|
|
ASSERT_OK(static_cast_with_check<DBImpl>(db->GetRootDB())
|
|
|
|
->TEST_FlushMemTable(true, false, handles[1]));
|
|
|
|
reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
|
|
|
|
ASSERT_OK(ReOpenNoDelete(cfds, &handles));
|
|
|
|
txn0 = db->GetTransactionByName("xid");
|
|
|
|
ASSERT_TRUE(txn0 != nullptr);
|
|
|
|
ASSERT_OK(txn0->Commit());
|
|
|
|
delete txn0;
|
|
|
|
pinnable_val.Reset();
|
|
|
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(pinnable_val == ("bar0d"));
|
|
|
|
pinnable_val.Reset();
|
|
|
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(pinnable_val == ("bar1c"));
|
|
|
|
pinnable_val.Reset();
|
|
|
|
s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(pinnable_val == ("bar1d"));
|
|
|
|
|
|
|
|
// Duplicate with ::Put, ::Delete
|
|
|
|
txn0 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_OK(txn0->SetName("xid"));
|
|
|
|
ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar0e")));
|
|
|
|
ASSERT_OK(txn0->Delete(handles[1], Slice("key-nonkey1")));
|
|
|
|
ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0e")));
|
|
|
|
ASSERT_OK(txn0->Delete(Slice("foo0")));
|
|
|
|
ASSERT_OK(txn0->Prepare());
|
|
|
|
delete txn0;
|
|
|
|
// This will check the asserts inside recovery code
|
|
|
|
ASSERT_OK(db->FlushWAL(true));
|
|
|
|
// Flush only cf 1
|
|
|
|
ASSERT_OK(static_cast_with_check<DBImpl>(db->GetRootDB())
|
|
|
|
->TEST_FlushMemTable(true, false, handles[1]));
|
|
|
|
reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
|
|
|
|
ASSERT_OK(ReOpenNoDelete(cfds, &handles));
|
|
|
|
txn0 = db->GetTransactionByName("xid");
|
|
|
|
ASSERT_TRUE(txn0 != nullptr);
|
|
|
|
ASSERT_OK(txn0->Commit());
|
|
|
|
delete txn0;
|
|
|
|
pinnable_val.Reset();
|
|
|
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
pinnable_val.Reset();
|
|
|
|
s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
// Duplicate with ::Put, ::SingleDelete
|
|
|
|
txn0 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_OK(txn0->SetName("xid"));
|
|
|
|
ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar0g")));
|
|
|
|
ASSERT_OK(txn0->SingleDelete(handles[1], Slice("key-nonkey1")));
|
|
|
|
ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0e")));
|
|
|
|
ASSERT_OK(txn0->SingleDelete(Slice("foo0")));
|
|
|
|
ASSERT_OK(txn0->Prepare());
|
|
|
|
delete txn0;
|
|
|
|
// This will check the asserts inside recovery code
|
|
|
|
ASSERT_OK(db->FlushWAL(true));
|
|
|
|
// Flush only cf 1
|
|
|
|
ASSERT_OK(static_cast_with_check<DBImpl>(db->GetRootDB())
|
|
|
|
->TEST_FlushMemTable(true, false, handles[1]));
|
|
|
|
reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
|
|
|
|
ASSERT_OK(ReOpenNoDelete(cfds, &handles));
|
|
|
|
txn0 = db->GetTransactionByName("xid");
|
|
|
|
ASSERT_TRUE(txn0 != nullptr);
|
|
|
|
ASSERT_OK(txn0->Commit());
|
|
|
|
delete txn0;
|
|
|
|
pinnable_val.Reset();
|
|
|
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
pinnable_val.Reset();
|
|
|
|
s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
// Duplicate with ::Put, ::Merge
|
|
|
|
txn0 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_OK(txn0->SetName("xid"));
|
|
|
|
ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar1i")));
|
|
|
|
ASSERT_OK(txn0->Merge(handles[1], Slice("key-nonkey1"), Slice("bar1j")));
|
|
|
|
ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0f")));
|
|
|
|
ASSERT_OK(txn0->Merge(Slice("foo0"), Slice("bar0g")));
|
|
|
|
ASSERT_OK(txn0->Prepare());
|
|
|
|
delete txn0;
|
|
|
|
// This will check the asserts inside recovery code
|
|
|
|
ASSERT_OK(db->FlushWAL(true));
|
|
|
|
// Flush only cf 1
|
|
|
|
ASSERT_OK(static_cast_with_check<DBImpl>(db->GetRootDB())
|
|
|
|
->TEST_FlushMemTable(true, false, handles[1]));
|
|
|
|
reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
|
|
|
|
ASSERT_OK(ReOpenNoDelete(cfds, &handles));
|
|
|
|
txn0 = db->GetTransactionByName("xid");
|
|
|
|
ASSERT_TRUE(txn0 != nullptr);
|
|
|
|
ASSERT_OK(txn0->Commit());
|
|
|
|
delete txn0;
|
|
|
|
pinnable_val.Reset();
|
|
|
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(pinnable_val == ("bar0f,bar0g"));
|
|
|
|
pinnable_val.Reset();
|
|
|
|
s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(pinnable_val == ("bar1i,bar1j"));
|
|
|
|
|
|
|
|
for (auto h : handles) {
|
|
|
|
delete h;
|
|
|
|
}
|
|
|
|
delete db;
|
|
|
|
db = nullptr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Test that the reseek optimization in iterators will not result in an infinite
|
|
|
|
// loop if there are too many uncommitted entries before the snapshot.
|
|
|
|
TEST_P(TransactionTest, ReseekOptimization) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
write_options.sync = true;
|
|
|
|
write_options.disableWAL = false;
|
|
|
|
ColumnFamilyDescriptor cfd;
|
|
|
|
ASSERT_OK(db->DefaultColumnFamily()->GetDescriptor(&cfd));
|
|
|
|
auto max_skip = cfd.options.max_sequential_skip_in_iterations;
|
|
|
|
|
|
|
|
ASSERT_OK(db->Put(write_options, Slice("foo0"), Slice("initv")));
|
|
|
|
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_OK(txn0->SetName("xid"));
|
|
|
|
// Duplicate keys will result into separate sequence numbers in WritePrepared
|
|
|
|
// and WriteUnPrepared
|
|
|
|
for (size_t i = 0; i < 2 * max_skip; i++) {
|
|
|
|
ASSERT_OK(txn0->Put(Slice("foo1"), Slice("bar")));
|
|
|
|
}
|
|
|
|
ASSERT_OK(txn0->Prepare());
|
|
|
|
ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("initv")));
|
|
|
|
|
|
|
|
ReadOptions read_options;
|
|
|
|
// To avoid loops
|
|
|
|
read_options.max_skippable_internal_keys = 10 * max_skip;
|
|
|
|
Iterator* iter = db->NewIterator(read_options);
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
size_t cnt = 0;
|
|
|
|
iter->SeekToFirst();
|
|
|
|
while (iter->Valid()) {
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
cnt++;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(cnt, 2);
|
|
|
|
cnt = 0;
|
|
|
|
iter->SeekToLast();
|
|
|
|
while (iter->Valid()) {
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
cnt++;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(cnt, 2);
|
|
|
|
delete iter;
|
|
|
|
ASSERT_OK(txn0->Rollback());
|
|
|
|
delete txn0;
|
|
|
|
}
|
|
|
|
|
|
|
|
// After recovery in kPointInTimeRecovery mode, the corrupted log file remains
|
|
|
|
// there. The new log files should be still read succesfully during recovery of
|
|
|
|
// the 2nd crash.
|
|
|
|
TEST_P(TransactionTest, DoubleCrashInRecovery) {
|
|
|
|
for (const bool manual_wal_flush : {false, true}) {
|
|
|
|
for (const bool write_after_recovery : {false, true}) {
|
|
|
|
options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
|
|
|
|
options.manual_wal_flush = manual_wal_flush;
|
|
|
|
ASSERT_OK(ReOpen());
|
|
|
|
std::string cf_name = "two";
|
|
|
|
ColumnFamilyOptions cf_options;
|
|
|
|
ColumnFamilyHandle* cf_handle = nullptr;
|
|
|
|
ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
|
|
|
|
|
|
|
|
// Add a prepare entry to prevent the older logs from being deleted.
|
|
|
|
WriteOptions write_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_OK(txn->SetName("xid"));
|
|
|
|
ASSERT_OK(txn->Put(Slice("foo-prepare"), Slice("bar-prepare")));
|
|
|
|
ASSERT_OK(txn->Prepare());
|
|
|
|
|
|
|
|
FlushOptions flush_ops;
|
|
|
|
ASSERT_OK(db->Flush(flush_ops));
|
|
|
|
// Now we have a log that cannot be deleted
|
|
|
|
|
|
|
|
ASSERT_OK(db->Put(write_options, cf_handle, "foo1", "bar1"));
|
|
|
|
// Flush only the 2nd cf
|
|
|
|
ASSERT_OK(db->Flush(flush_ops, cf_handle));
|
|
|
|
|
|
|
|
// The value is large enough to be touched by the corruption we ingest
|
|
|
|
// below.
|
|
|
|
std::string large_value(400, ' ');
|
|
|
|
// key/value not touched by corruption
|
|
|
|
ASSERT_OK(db->Put(write_options, "foo2", "bar2"));
|
|
|
|
// key/value touched by corruption
|
|
|
|
ASSERT_OK(db->Put(write_options, "foo3", large_value));
|
|
|
|
// key/value not touched by corruption
|
|
|
|
ASSERT_OK(db->Put(write_options, "foo4", "bar4"));
|
|
|
|
|
|
|
|
ASSERT_OK(db->FlushWAL(true));
|
|
|
|
DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
|
|
|
|
uint64_t wal_file_id = db_impl->TEST_LogfileNumber();
|
|
|
|
std::string fname = LogFileName(dbname, wal_file_id);
|
|
|
|
reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
|
|
|
|
delete txn;
|
|
|
|
delete cf_handle;
|
|
|
|
delete db;
|
|
|
|
db = nullptr;
|
|
|
|
|
|
|
|
// Corrupt the last log file in the middle, so that it is not corrupted
|
|
|
|
// in the tail.
|
|
|
|
std::string file_content;
|
|
|
|
ASSERT_OK(ReadFileToString(env, fname, &file_content));
|
|
|
|
file_content[400] = 'h';
|
|
|
|
file_content[401] = 'a';
|
|
|
|
ASSERT_OK(env->DeleteFile(fname));
|
|
|
|
ASSERT_OK(WriteStringToFile(env, file_content, fname, true));
|
|
|
|
|
|
|
|
// Recover from corruption
|
|
|
|
std::vector<ColumnFamilyHandle*> handles;
|
|
|
|
std::vector<ColumnFamilyDescriptor> column_families;
|
|
|
|
column_families.push_back(ColumnFamilyDescriptor(kDefaultColumnFamilyName,
|
|
|
|
ColumnFamilyOptions()));
|
|
|
|
column_families.push_back(
|
|
|
|
ColumnFamilyDescriptor("two", ColumnFamilyOptions()));
|
|
|
|
ASSERT_OK(ReOpenNoDelete(column_families, &handles));
|
|
|
|
assert(db != nullptr);
|
|
|
|
|
|
|
|
if (write_after_recovery) {
|
|
|
|
// Write data to the log right after the corrupted log
|
|
|
|
ASSERT_OK(db->Put(write_options, "foo5", large_value));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Persist data written to WAL during recovery or by the last Put
|
|
|
|
ASSERT_OK(db->FlushWAL(true));
|
|
|
|
// 2nd crash to recover while having a valid log after the corrupted one.
|
|
|
|
ASSERT_OK(ReOpenNoDelete(column_families, &handles));
|
|
|
|
assert(db != nullptr);
|
|
|
|
txn = db->GetTransactionByName("xid");
|
|
|
|
ASSERT_TRUE(txn != nullptr);
|
|
|
|
ASSERT_OK(txn->Commit());
|
|
|
|
delete txn;
|
|
|
|
for (auto handle : handles) {
|
|
|
|
delete handle;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, CommitWithoutPrepare) {
|
|
|
|
{
|
|
|
|
// skip_prepare = false.
|
|
|
|
WriteOptions write_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
txn_options.skip_prepare = false;
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_TRUE(txn->Commit().IsTxnNotPrepared());
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
// skip_prepare = true.
|
|
|
|
WriteOptions write_options;
|
|
|
|
TransactionOptions txn_options;
|
|
|
|
txn_options.skip_prepare = true;
|
|
|
|
Transaction* txn = db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_OK(txn->Commit());
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, OpenAndEnableU64Timestamp) {
|
|
|
|
ASSERT_OK(ReOpenNoDelete());
|
|
|
|
|
|
|
|
assert(db);
|
|
|
|
|
|
|
|
const std::string test_cf_name = "test_cf";
|
|
|
|
ColumnFamilyOptions cf_opts;
|
|
|
|
cf_opts.comparator = test::BytewiseComparatorWithU64TsWrapper();
|
|
|
|
{
|
|
|
|
ColumnFamilyHandle* cfh = nullptr;
|
Support user-defined timestamps in write-committed txns (#9629)
Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9629
Pessimistic transactions use pessimistic concurrency control, i.e. locking. Keys are
locked upon first operation that writes the key or has the intention of writing. For example,
`PessimisticTransaction::Put()`, `PessimisticTransaction::Delete()`,
`PessimisticTransaction::SingleDelete()` will write to or delete a key, while
`PessimisticTransaction::GetForUpdate()` is used by application to indicate
to RocksDB that the transaction has the intention of performing write operation later
in the same transaction.
Pessimistic transactions support two-phase commit (2PC). A transaction can be
`Prepared()`'ed and then `Commit()`. The prepare phase is similar to a promise: once
`Prepare()` succeeds, the transaction has acquired the necessary resources to commit.
The resources include locks, persistence of WAL, etc.
Write-committed transaction is the default pessimistic transaction implementation. In
RocksDB write-committed transaction, `Prepare()` will write data to the WAL as a prepare
section. `Commit()` will write a commit marker to the WAL and then write data to the
memtables. While writing to the memtables, different keys in the transaction's write batch
will be assigned different sequence numbers in ascending order.
Until commit/rollback, the transaction holds locks on the keys so that no other transaction
can write to the same keys. Furthermore, the keys' sequence numbers represent the order
in which they are committed and should be made visible. This is convenient for us to
implement support for user-defined timestamps.
Since column families with and without timestamps can co-exist in the same database,
a transaction may or may not involve timestamps. Based on this observation, we add two
optional members to each `PessimisticTransaction`, `read_timestamp_` and
`commit_timestamp_`. If no key in the transaction's write batch has timestamp, then
setting these two variables do not have any effect. For the rest of this commit, we discuss
only the cases when these two variables are meaningful.
read_timestamp_ is used mainly for validation, and should be set before first call to
`GetForUpdate()`. Otherwise, the latter will return non-ok status. `GetForUpdate()` calls
`TryLock()` that can verify if another transaction has written the same key since
`read_timestamp_` till this call to `GetForUpdate()`. If another transaction has indeed
written the same key, then validation fails, and RocksDB allows this transaction to
refine `read_timestamp_` by increasing it. Note that a transaction can still use `Get()`
with a different timestamp to read, but the result of the read should not be used to
determine data that will be written later.
commit_timestamp_ must be set after finishing writing and before transaction commit.
This applies to both 2PC and non-2PC cases. In the case of 2PC, it's usually set after
prepare phase succeeds.
We currently require that the commit timestamp be chosen after all keys are locked. This
means we disallow the `TransactionDB`-level APIs if user-defined timestamp is used
by the transaction. Specifically, calling `PessimisticTransactionDB::Put()`,
`PessimisticTransactionDB::Delete()`, `PessimisticTransactionDB::SingleDelete()`,
etc. will return non-ok status because they specify timestamps before locking the keys.
Users are also prompted to use the `Transaction` APIs when they receive the non-ok status.
Reviewed By: ltamasi
Differential Revision: D31822445
fbshipit-source-id: b82abf8e230216dc89cc519564a588224a88fd43
3 years ago
|
|
|
const Status s = db->CreateColumnFamily(cf_opts, test_cf_name, &cfh);
|
|
|
|
if (txn_db_options.write_policy == WRITE_COMMITTED) {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete cfh;
|
|
|
|
} else {
|
|
|
|
ASSERT_TRUE(s.IsNotSupported());
|
|
|
|
assert(!cfh);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Bypass transaction db layer.
|
Support user-defined timestamps in write-committed txns (#9629)
Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9629
Pessimistic transactions use pessimistic concurrency control, i.e. locking. Keys are
locked upon first operation that writes the key or has the intention of writing. For example,
`PessimisticTransaction::Put()`, `PessimisticTransaction::Delete()`,
`PessimisticTransaction::SingleDelete()` will write to or delete a key, while
`PessimisticTransaction::GetForUpdate()` is used by application to indicate
to RocksDB that the transaction has the intention of performing write operation later
in the same transaction.
Pessimistic transactions support two-phase commit (2PC). A transaction can be
`Prepared()`'ed and then `Commit()`. The prepare phase is similar to a promise: once
`Prepare()` succeeds, the transaction has acquired the necessary resources to commit.
The resources include locks, persistence of WAL, etc.
Write-committed transaction is the default pessimistic transaction implementation. In
RocksDB write-committed transaction, `Prepare()` will write data to the WAL as a prepare
section. `Commit()` will write a commit marker to the WAL and then write data to the
memtables. While writing to the memtables, different keys in the transaction's write batch
will be assigned different sequence numbers in ascending order.
Until commit/rollback, the transaction holds locks on the keys so that no other transaction
can write to the same keys. Furthermore, the keys' sequence numbers represent the order
in which they are committed and should be made visible. This is convenient for us to
implement support for user-defined timestamps.
Since column families with and without timestamps can co-exist in the same database,
a transaction may or may not involve timestamps. Based on this observation, we add two
optional members to each `PessimisticTransaction`, `read_timestamp_` and
`commit_timestamp_`. If no key in the transaction's write batch has timestamp, then
setting these two variables do not have any effect. For the rest of this commit, we discuss
only the cases when these two variables are meaningful.
read_timestamp_ is used mainly for validation, and should be set before first call to
`GetForUpdate()`. Otherwise, the latter will return non-ok status. `GetForUpdate()` calls
`TryLock()` that can verify if another transaction has written the same key since
`read_timestamp_` till this call to `GetForUpdate()`. If another transaction has indeed
written the same key, then validation fails, and RocksDB allows this transaction to
refine `read_timestamp_` by increasing it. Note that a transaction can still use `Get()`
with a different timestamp to read, but the result of the read should not be used to
determine data that will be written later.
commit_timestamp_ must be set after finishing writing and before transaction commit.
This applies to both 2PC and non-2PC cases. In the case of 2PC, it's usually set after
prepare phase succeeds.
We currently require that the commit timestamp be chosen after all keys are locked. This
means we disallow the `TransactionDB`-level APIs if user-defined timestamp is used
by the transaction. Specifically, calling `PessimisticTransactionDB::Put()`,
`PessimisticTransactionDB::Delete()`, `PessimisticTransactionDB::SingleDelete()`,
etc. will return non-ok status because they specify timestamps before locking the keys.
Users are also prompted to use the `Transaction` APIs when they receive the non-ok status.
Reviewed By: ltamasi
Differential Revision: D31822445
fbshipit-source-id: b82abf8e230216dc89cc519564a588224a88fd43
3 years ago
|
|
|
if (txn_db_options.write_policy != WRITE_COMMITTED) {
|
|
|
|
DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
|
|
|
|
assert(db_impl);
|
|
|
|
ColumnFamilyHandle* cfh = nullptr;
|
|
|
|
ASSERT_OK(db_impl->CreateColumnFamily(cf_opts, test_cf_name, &cfh));
|
|
|
|
delete cfh;
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
std::vector<ColumnFamilyDescriptor> cf_descs;
|
|
|
|
cf_descs.emplace_back(kDefaultColumnFamilyName, options);
|
|
|
|
cf_descs.emplace_back(test_cf_name, cf_opts);
|
|
|
|
std::vector<ColumnFamilyHandle*> handles;
|
Support user-defined timestamps in write-committed txns (#9629)
Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9629
Pessimistic transactions use pessimistic concurrency control, i.e. locking. Keys are
locked upon first operation that writes the key or has the intention of writing. For example,
`PessimisticTransaction::Put()`, `PessimisticTransaction::Delete()`,
`PessimisticTransaction::SingleDelete()` will write to or delete a key, while
`PessimisticTransaction::GetForUpdate()` is used by application to indicate
to RocksDB that the transaction has the intention of performing write operation later
in the same transaction.
Pessimistic transactions support two-phase commit (2PC). A transaction can be
`Prepared()`'ed and then `Commit()`. The prepare phase is similar to a promise: once
`Prepare()` succeeds, the transaction has acquired the necessary resources to commit.
The resources include locks, persistence of WAL, etc.
Write-committed transaction is the default pessimistic transaction implementation. In
RocksDB write-committed transaction, `Prepare()` will write data to the WAL as a prepare
section. `Commit()` will write a commit marker to the WAL and then write data to the
memtables. While writing to the memtables, different keys in the transaction's write batch
will be assigned different sequence numbers in ascending order.
Until commit/rollback, the transaction holds locks on the keys so that no other transaction
can write to the same keys. Furthermore, the keys' sequence numbers represent the order
in which they are committed and should be made visible. This is convenient for us to
implement support for user-defined timestamps.
Since column families with and without timestamps can co-exist in the same database,
a transaction may or may not involve timestamps. Based on this observation, we add two
optional members to each `PessimisticTransaction`, `read_timestamp_` and
`commit_timestamp_`. If no key in the transaction's write batch has timestamp, then
setting these two variables do not have any effect. For the rest of this commit, we discuss
only the cases when these two variables are meaningful.
read_timestamp_ is used mainly for validation, and should be set before first call to
`GetForUpdate()`. Otherwise, the latter will return non-ok status. `GetForUpdate()` calls
`TryLock()` that can verify if another transaction has written the same key since
`read_timestamp_` till this call to `GetForUpdate()`. If another transaction has indeed
written the same key, then validation fails, and RocksDB allows this transaction to
refine `read_timestamp_` by increasing it. Note that a transaction can still use `Get()`
with a different timestamp to read, but the result of the read should not be used to
determine data that will be written later.
commit_timestamp_ must be set after finishing writing and before transaction commit.
This applies to both 2PC and non-2PC cases. In the case of 2PC, it's usually set after
prepare phase succeeds.
We currently require that the commit timestamp be chosen after all keys are locked. This
means we disallow the `TransactionDB`-level APIs if user-defined timestamp is used
by the transaction. Specifically, calling `PessimisticTransactionDB::Put()`,
`PessimisticTransactionDB::Delete()`, `PessimisticTransactionDB::SingleDelete()`,
etc. will return non-ok status because they specify timestamps before locking the keys.
Users are also prompted to use the `Transaction` APIs when they receive the non-ok status.
Reviewed By: ltamasi
Differential Revision: D31822445
fbshipit-source-id: b82abf8e230216dc89cc519564a588224a88fd43
3 years ago
|
|
|
const Status s = ReOpenNoDelete(cf_descs, &handles);
|
|
|
|
if (txn_db_options.write_policy == WRITE_COMMITTED) {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
for (auto* h : handles) {
|
|
|
|
delete h;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
ASSERT_TRUE(s.IsNotSupported());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, OpenAndEnableU32Timestamp) {
|
|
|
|
class DummyComparatorWithU32Ts : public Comparator {
|
|
|
|
public:
|
|
|
|
DummyComparatorWithU32Ts() : Comparator(sizeof(uint32_t)) {}
|
|
|
|
const char* Name() const override { return "DummyComparatorWithU32Ts"; }
|
|
|
|
void FindShortSuccessor(std::string*) const override {}
|
|
|
|
void FindShortestSeparator(std::string*, const Slice&) const override {}
|
|
|
|
int Compare(const Slice&, const Slice&) const override { return 0; }
|
|
|
|
};
|
|
|
|
|
|
|
|
std::unique_ptr<Comparator> dummy_ucmp(new DummyComparatorWithU32Ts());
|
|
|
|
|
|
|
|
ASSERT_OK(ReOpenNoDelete());
|
|
|
|
|
|
|
|
assert(db);
|
|
|
|
|
|
|
|
const std::string test_cf_name = "test_cf";
|
|
|
|
|
|
|
|
ColumnFamilyOptions cf_opts;
|
|
|
|
cf_opts.comparator = dummy_ucmp.get();
|
|
|
|
{
|
|
|
|
ColumnFamilyHandle* cfh = nullptr;
|
|
|
|
ASSERT_TRUE(db->CreateColumnFamily(cf_opts, test_cf_name, &cfh)
|
|
|
|
.IsInvalidArgument());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Bypass transaction db layer.
|
|
|
|
{
|
|
|
|
ColumnFamilyHandle* cfh = nullptr;
|
|
|
|
DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
|
|
|
|
assert(db_impl);
|
|
|
|
ASSERT_OK(db_impl->CreateColumnFamily(cf_opts, test_cf_name, &cfh));
|
|
|
|
delete cfh;
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
std::vector<ColumnFamilyDescriptor> cf_descs;
|
|
|
|
cf_descs.emplace_back(kDefaultColumnFamilyName, options);
|
|
|
|
cf_descs.emplace_back(test_cf_name, cf_opts);
|
|
|
|
std::vector<ColumnFamilyHandle*> handles;
|
|
|
|
ASSERT_TRUE(ReOpenNoDelete(cf_descs, &handles).IsInvalidArgument());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, WriteWithBulkCreatedColumnFamilies) {
|
|
|
|
ColumnFamilyOptions cf_options;
|
|
|
|
WriteOptions write_options;
|
|
|
|
|
|
|
|
std::vector<std::string> cf_names;
|
|
|
|
std::vector<ColumnFamilyHandle*> cf_handles;
|
|
|
|
|
|
|
|
cf_names.push_back("test_cf");
|
|
|
|
|
|
|
|
ASSERT_OK(db->CreateColumnFamilies(cf_options, cf_names, &cf_handles));
|
|
|
|
ASSERT_OK(db->Put(write_options, cf_handles[0], "foo", "bar"));
|
|
|
|
ASSERT_OK(db->DropColumnFamilies(cf_handles));
|
|
|
|
|
|
|
|
for (auto* h : cf_handles) {
|
|
|
|
delete h;
|
|
|
|
}
|
|
|
|
cf_handles.clear();
|
|
|
|
|
|
|
|
std::vector<ColumnFamilyDescriptor> cf_descriptors;
|
|
|
|
|
|
|
|
cf_descriptors.emplace_back("test_cf", ColumnFamilyOptions());
|
|
|
|
|
|
|
|
ASSERT_OK(db->CreateColumnFamilies(cf_options, cf_names, &cf_handles));
|
|
|
|
ASSERT_OK(db->Put(write_options, cf_handles[0], "foo", "bar"));
|
|
|
|
ASSERT_OK(db->DropColumnFamilies(cf_handles));
|
|
|
|
for (auto* h : cf_handles) {
|
|
|
|
delete h;
|
|
|
|
}
|
|
|
|
cf_handles.clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(TransactionTest, LockWal) {
|
|
|
|
const TxnDBWritePolicy write_policy = std::get<2>(GetParam());
|
|
|
|
if (TxnDBWritePolicy::WRITE_COMMITTED != write_policy) {
|
|
|
|
ROCKSDB_GTEST_BYPASS("Test only write-committed for now");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
ASSERT_OK(ReOpen());
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"TransactionTest::LockWal:AfterLockWal",
|
|
|
|
"TransactionTest::LockWal:BeforePrepareTxn2"}});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
std::unique_ptr<Transaction> txn0;
|
|
|
|
WriteOptions wopts;
|
|
|
|
wopts.no_slowdown = true;
|
|
|
|
txn0.reset(db->BeginTransaction(wopts, TransactionOptions()));
|
|
|
|
ASSERT_OK(txn0->SetName("txn0"));
|
|
|
|
ASSERT_OK(txn0->Put("foo", "v0"));
|
|
|
|
|
|
|
|
std::unique_ptr<Transaction> txn1;
|
|
|
|
txn1.reset(db->BeginTransaction(wopts, TransactionOptions()));
|
|
|
|
ASSERT_OK(txn1->SetName("txn1"));
|
|
|
|
ASSERT_OK(txn1->Put("dummy", "v0"));
|
|
|
|
ASSERT_OK(txn1->Prepare());
|
|
|
|
|
|
|
|
std::unique_ptr<Transaction> txn2;
|
|
|
|
port::Thread worker([&]() {
|
|
|
|
txn2.reset(db->BeginTransaction(WriteOptions(), TransactionOptions()));
|
|
|
|
ASSERT_OK(txn2->SetName("txn2"));
|
|
|
|
ASSERT_OK(txn2->Put("bar", "v0"));
|
|
|
|
TEST_SYNC_POINT("TransactionTest::LockWal:BeforePrepareTxn2");
|
|
|
|
ASSERT_OK(txn2->Prepare());
|
|
|
|
ASSERT_OK(txn2->Commit());
|
|
|
|
});
|
|
|
|
ASSERT_OK(db->LockWAL());
|
|
|
|
// txn0 cannot prepare
|
|
|
|
Status s = txn0->Prepare();
|
|
|
|
ASSERT_TRUE(s.IsIncomplete());
|
|
|
|
// txn1 cannot commit
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_TRUE(s.IsIncomplete());
|
|
|
|
|
|
|
|
TEST_SYNC_POINT("TransactionTest::LockWal:AfterLockWal");
|
|
|
|
|
|
|
|
ASSERT_OK(db->UnlockWAL());
|
|
|
|
txn0.reset();
|
|
|
|
|
|
|
|
txn0.reset(db->BeginTransaction(wopts, TransactionOptions()));
|
|
|
|
ASSERT_OK(txn0->SetName("txn0_1"));
|
|
|
|
ASSERT_OK(txn0->Put("foo", "v1"));
|
|
|
|
ASSERT_OK(txn0->Prepare());
|
|
|
|
ASSERT_OK(txn0->Commit());
|
|
|
|
worker.join();
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
|
Fix DelayWrite() calls for two_write_queues (#11130)
Summary:
PR https://github.com/facebook/rocksdb/issues/11020 fixed a case where it was easy to deadlock the DB with LockWAL() but introduced a bug showing up as a rare assertion failure in the stress test. Specifically, `assert(w->state == STATE_INIT)` in `WriteThread::LinkOne()` called from `BeginWriteStall()`, `DelayWrite()`, `WriteImplWALOnly()`. I haven't been about to generate a unit test that reproduces this failure but I believe the root cause is that DelayWrite() was never meant to be re-entrant, only called from the DB's write_thread_ leader. https://github.com/facebook/rocksdb/issues/11020 introduced a call to DelayWrite() from the nonmem_write_thread_ group leader.
This fix is to make DelayWrite() apply to the specific write queue that it is being called from (inject a dummy write stall entry to the head of the appropriate write queue). WriteController is re-entrant, based on polling and state changes signalled with bg_cv_, so can manage stalling two queues. The only anticipated complication (called out by Andrew in previous PR) is that we don't want timed write delays being injected in parallel for the two queues, because that dimishes the intended throttling effect. Thus, we only allow timed delays for the primary write queue.
HISTORY not updated because this is intended for the same release where the bug was introduced.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11130
Test Plan:
Although I was not able to reproduce the assertion failure, I was able to reproduce a distinct flaw with what I believe is the same root cause: a kind of deadlock if both write queues need to wake up from stopped writes. Only one will be waiting on bg_cv_ (the other waiting in `LinkOne()` for the write queue to open up), so a single SignalAll() will only unblock one of the queues, with the other re-instating the stop until another signal on bg_cv_. A simple unit test is added for this case.
Will also run crash_test_with_multiops_wc_txn for a while looking for issues.
Reviewed By: ajkr
Differential Revision: D42749330
Pulled By: pdillinger
fbshipit-source-id: 4317dd899a93d57c26fd5af7143038f82d4d4d1b
2 years ago
|
|
|
TEST_P(TransactionTest, StallTwoWriteQueues) {
|
|
|
|
// There was a two_write_queues bug in which both write thread leaders (for
|
|
|
|
// each queue) would attempt to own the stopping of writes in the primary
|
|
|
|
// write queue. This nearly worked but could lead to some broken assertions
|
|
|
|
// and a kind of deadlock in the test below. (Would resume if someone
|
|
|
|
// eventually signalled bg_cv_ again.)
|
|
|
|
if (!options.two_write_queues) {
|
|
|
|
ROCKSDB_GTEST_BYPASS("Test only needed with two_write_queues");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Stop writes
|
|
|
|
ASSERT_OK(db->LockWAL());
|
|
|
|
|
|
|
|
WriteOptions wopts;
|
|
|
|
wopts.sync = true;
|
|
|
|
wopts.disableWAL = false;
|
|
|
|
|
|
|
|
// Create one write thread that blocks in the primary write queue and one
|
|
|
|
// that blocks in the nonmem queue.
|
|
|
|
bool t1_completed = false;
|
|
|
|
bool t2_completed = false;
|
|
|
|
port::Thread t1{[&]() {
|
|
|
|
ASSERT_OK(db->Put(wopts, "x", "y"));
|
|
|
|
t1_completed = true;
|
|
|
|
}};
|
|
|
|
port::Thread t2{[&]() {
|
|
|
|
std::unique_ptr<Transaction> txn0{db->BeginTransaction(wopts, {})};
|
|
|
|
ASSERT_OK(txn0->SetName("xid"));
|
|
|
|
ASSERT_OK(txn0->Prepare()); // nonmem
|
|
|
|
ASSERT_OK(txn0->Commit());
|
|
|
|
t2_completed = true;
|
|
|
|
}};
|
|
|
|
|
|
|
|
// Sleep long enough to that above threads can usually reach a waiting point,
|
|
|
|
// to usually reveal deadlock if the bug is present.
|
|
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
|
|
|
// Ensure proper test setup
|
|
|
|
ASSERT_FALSE(t1_completed);
|
|
|
|
ASSERT_FALSE(t2_completed);
|
|
|
|
|
|
|
|
// Resume writes
|
|
|
|
ASSERT_OK(db->UnlockWAL());
|
|
|
|
|
|
|
|
// Wait for writes to finish
|
|
|
|
t1.join();
|
|
|
|
t2.join();
|
|
|
|
// Ensure proper test setup
|
|
|
|
ASSERT_TRUE(t1_completed);
|
|
|
|
ASSERT_TRUE(t2_completed);
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
|
|
|
|
int main(int argc, char** argv) {
|
|
|
|
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
::testing::InitGoogleTest(&argc, argv);
|
|
|
|
return RUN_ALL_TESTS();
|
|
|
|
}
|
|
|
|
|
|
|
|
#else
|
|
|
|
#include <stdio.h>
|
|
|
|
|
|
|
|
int main(int /*argc*/, char** /*argv*/) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"SKIPPED as Transactions are not supported in ROCKSDB_LITE\n");
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
10 years ago
|
|
|
#endif // ROCKSDB_LITE
|