From c2f2cb02142109dc411085e833f6a2265776fd32 Mon Sep 17 00:00:00 2001 From: agiardullo Date: Mon, 25 May 2015 17:37:33 -0700 Subject: [PATCH] Pessimistic Transactions Summary: Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it. MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues. Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable. Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing. Test Plan: Unit tests, db_bench parallel testing. Reviewers: igor, rven, sdong, yhchiang, yoshinorim Reviewed By: sdong Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba Differential Revision: https://reviews.facebook.net/D40869 --- CMakeLists.txt | 5 + HISTORY.md | 1 + Makefile | 6 +- db/db_bench.cc | 86 +- db/db_impl.cc | 48 +- db/db_impl.h | 10 + examples/Makefile | 7 +- examples/optimistic_transaction_example.cc | 142 ++ examples/transaction_example.cc | 52 +- include/rocksdb/status.h | 4 + .../utilities/optimistic_transaction.h | 233 --- .../utilities/optimistic_transaction_db.h | 18 +- include/rocksdb/utilities/transaction.h | 260 +++ include/rocksdb/utilities/transaction_db.h | 130 ++ src.mk | 5 + util/status.cc | 3 + .../optimistic_transaction_db_impl.cc | 4 +- .../optimistic_transaction_db_impl.h | 2 +- .../optimistic_transaction_impl.cc | 267 +-- .../optimistic_transaction_impl.h | 108 +- .../optimistic_transaction_test.cc | 318 +++- utilities/transactions/transaction_db_impl.cc | 254 +++ 
utilities/transactions/transaction_db_impl.h | 80 + utilities/transactions/transaction_impl.cc | 598 +++++++ utilities/transactions/transaction_impl.h | 263 +++ .../transactions/transaction_lock_mgr.cc | 443 +++++ utilities/transactions/transaction_lock_mgr.h | 90 + utilities/transactions/transaction_test.cc | 1587 +++++++++++++++++ utilities/transactions/transaction_util.cc | 265 +++ utilities/transactions/transaction_util.h | 65 + .../write_batch_with_index.cc | 13 +- 31 files changed, 4875 insertions(+), 492 deletions(-) create mode 100644 examples/optimistic_transaction_example.cc delete mode 100644 include/rocksdb/utilities/optimistic_transaction.h create mode 100644 include/rocksdb/utilities/transaction.h create mode 100644 include/rocksdb/utilities/transaction_db.h create mode 100644 utilities/transactions/transaction_db_impl.cc create mode 100644 utilities/transactions/transaction_db_impl.h create mode 100644 utilities/transactions/transaction_impl.cc create mode 100644 utilities/transactions/transaction_impl.h create mode 100644 utilities/transactions/transaction_lock_mgr.cc create mode 100644 utilities/transactions/transaction_lock_mgr.h create mode 100644 utilities/transactions/transaction_test.cc create mode 100644 utilities/transactions/transaction_util.cc create mode 100644 utilities/transactions/transaction_util.h diff --git a/CMakeLists.txt b/CMakeLists.txt index ab5c136ac..1d1408349 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -221,6 +221,10 @@ set(SOURCES utilities/table_properties_collectors/compact_on_deletion_collector.cc utilities/transactions/optimistic_transaction_impl.cc utilities/transactions/optimistic_transaction_db_impl.cc + utilities/transactions/transaction_impl.cc + utilities/transactions/transaction_db_impl.cc + utilities/transactions/transaction_lock_mgr.cc + utilities/transactions/transaction_util.cc utilities/ttl/db_ttl_impl.cc utilities/write_batch_with_index/write_batch_with_index.cc 
utilities/write_batch_with_index/write_batch_with_index_internal.cc @@ -333,6 +337,7 @@ set(TESTS utilities/spatialdb/spatial_db_test.cc utilities/table_properties_collectors/compact_on_deletion_collector_test.cc utilities/transactions/optimistic_transaction_test.cc + utilities/transactions/transaction_test.cc utilities/ttl/ttl_test.cc utilities/write_batch_with_index/write_batch_with_index_test.cc ) diff --git a/HISTORY.md b/HISTORY.md index d590e3f91..d0eaac53f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -17,6 +17,7 @@ ## 3.12.0 (7/2/2015) ### New Features +* Added experimental support for pessimistic transactions. See include/rocksdb/utilities/transaction.h for more info. * Added experimental support for optimistic transactions. See include/rocksdb/utilities/optimistic_transaction.h for more info. * Added a new way to report QPS from db_bench (check out --report_file and --report_interval_seconds) * Added a cache for individual rows. See DBOptions::row_cache for more info. diff --git a/Makefile b/Makefile index 4dbc0bfd5..2a956766a 100644 --- a/Makefile +++ b/Makefile @@ -304,7 +304,8 @@ TESTS = \ write_callback_test \ heap_test \ compact_on_deletion_collector_test \ - compaction_job_stats_test + compaction_job_stats_test \ + transaction_test SUBSET := $(shell echo $(TESTS) |sed s/^.*$(ROCKSDBTESTS_START)/$(ROCKSDBTESTS_START)/) @@ -919,6 +920,9 @@ write_callback_test: db/write_callback_test.o $(LIBOBJECTS) $(TESTHARNESS) heap_test: util/heap_test.o $(GTEST) $(AM_LINK) +transaction_test: utilities/transactions/transaction_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + sst_dump: tools/sst_dump.o $(LIBOBJECTS) $(AM_LINK) diff --git a/db/db_bench.cc b/db/db_bench.cc index 2332afdbf..3c47a7839 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -54,7 +54,8 @@ int main() { #include "rocksdb/slice_transform.h" #include "rocksdb/perf_context.h" #include "rocksdb/utilities/flashcache.h" -#include "rocksdb/utilities/optimistic_transaction.h" +#include 
"rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "port/port.h" #include "port/stack_trace.h" @@ -448,10 +449,14 @@ DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/" DEFINE_uint64(delete_obsolete_files_period_micros, 0, "Ignored. Left here for backward compatibility"); -DEFINE_bool(transaction_db, false, +DEFINE_bool(optimistic_transaction_db, false, "Open a OptimisticTransactionDB instance. " "Required for randomtransaction benchmark."); +DEFINE_bool(transaction_db, false, + "Open a TransactionDB instance. " + "Required for randomtransaction benchmark."); + DEFINE_uint64(transaction_sets, 2, "Number of keys each transaction will " "modify (use in RandomTransaction only). Max: 9999"); @@ -919,7 +924,7 @@ static void AppendWithSpace(std::string* str, Slice msg) { struct DBWithColumnFamilies { std::vector cfh; DB* db; - OptimisticTransactionDB* txn_db; + OptimisticTransactionDB* opt_txn_db; std::atomic num_created; // Need to be updated after all the // new entries in cfh are set. size_t num_hot; // Number of column families to be queried at each moment. @@ -927,7 +932,7 @@ struct DBWithColumnFamilies { // Column families will be created and used to be queried. 
port::Mutex create_cf_mutex; // Only one thread can execute CreateNewCf() - DBWithColumnFamilies() : db(nullptr), txn_db(nullptr) { + DBWithColumnFamilies() : db(nullptr), opt_txn_db(nullptr) { cfh.clear(); num_created = 0; num_hot = 0; @@ -936,7 +941,7 @@ struct DBWithColumnFamilies { DBWithColumnFamilies(const DBWithColumnFamilies& other) : cfh(other.cfh), db(other.db), - txn_db(other.txn_db), + opt_txn_db(other.opt_txn_db), num_created(other.num_created.load()), num_hot(other.num_hot) {} @@ -944,9 +949,9 @@ struct DBWithColumnFamilies { std::for_each(cfh.begin(), cfh.end(), [](ColumnFamilyHandle* cfhi) { delete cfhi; }); cfh.clear(); - if (txn_db) { - delete txn_db; - txn_db = nullptr; + if (opt_txn_db) { + delete opt_txn_db; + opt_txn_db = nullptr; } else { delete db; } @@ -2445,11 +2450,19 @@ class Benchmark { if (FLAGS_readonly) { s = DB::OpenForReadOnly(options, db_name, column_families, &db->cfh, &db->db); - } else if (FLAGS_transaction_db) { + } else if (FLAGS_optimistic_transaction_db) { s = OptimisticTransactionDB::Open(options, db_name, column_families, - &db->cfh, &db->txn_db); + &db->cfh, &db->opt_txn_db); + if (s.ok()) { + db->db = db->opt_txn_db->GetBaseDB(); + } + } else if (FLAGS_transaction_db) { + TransactionDB* ptr; + TransactionDBOptions txn_db_options; + s = TransactionDB::Open(options, txn_db_options, db_name, + column_families, &db->cfh, &ptr); if (s.ok()) { - db->db = db->txn_db->GetBaseDB(); + db->db = ptr; } } else { s = DB::Open(options, db_name, column_families, &db->cfh, &db->db); @@ -2459,11 +2472,19 @@ class Benchmark { db->num_hot = num_hot; } else if (FLAGS_readonly) { s = DB::OpenForReadOnly(options, db_name, &db->db); + } else if (FLAGS_optimistic_transaction_db) { + s = OptimisticTransactionDB::Open(options, db_name, &db->opt_txn_db); + if (s.ok()) { + db->db = db->opt_txn_db->GetBaseDB(); + } } else if (FLAGS_transaction_db) { - s = OptimisticTransactionDB::Open(options, db_name, &db->txn_db); + TransactionDB* ptr; + 
TransactionDBOptions txn_db_options; + s = TransactionDB::Open(options, txn_db_options, db_name, &ptr); if (s.ok()) { - db->db = db->txn_db->GetBaseDB(); + db->db = ptr; } + } else { s = DB::Open(options, db_name, &db->db); } @@ -3530,7 +3551,6 @@ class Benchmark { uint64_t transactions_aborted = 0; Status s; uint64_t num_prefix_ranges = FLAGS_transaction_sets; - bool use_txn = FLAGS_transaction_db; if (num_prefix_ranges == 0 || num_prefix_ranges > 9999) { fprintf(stderr, "invalid value for transaction_sets\n"); @@ -3545,12 +3565,17 @@ class Benchmark { } while (!duration.Done(1)) { - OptimisticTransaction* txn = nullptr; + Transaction* txn = nullptr; WriteBatch* batch = nullptr; - if (use_txn) { - txn = db_.txn_db->BeginTransaction(write_options_); + if (FLAGS_optimistic_transaction_db) { + txn = db_.opt_txn_db->BeginTransaction(write_options_); assert(txn); + } else if (FLAGS_transaction_db) { + TransactionDB* txn_db = reinterpret_cast(db_.db); + TransactionOptions txn_options; + txn_options.expiration = 10000000; + txn = txn_db->BeginTransaction(write_options_, txn_options); } else { batch = new WriteBatch(); } @@ -3558,6 +3583,7 @@ class Benchmark { // pick a random number to use to increment a key in each set uint64_t incr = (thread->rand.Next() % 100) + 1; + bool failed = false; // For each set, pick a key at random and increment it for (uint8_t i = 0; i < num_prefix_ranges; i++) { uint64_t int_value; @@ -3572,8 +3598,8 @@ class Benchmark { std::string full_key = std::string(prefix_buf) + base_key.ToString(); Slice key(full_key); - if (use_txn) { - s = txn->Get(read_options, key, &value); + if (txn) { + s = txn->GetForUpdate(read_options, key, &value); } else { s = db->Get(read_options, key, &value); } @@ -3599,15 +3625,23 @@ class Benchmark { } std::string sum = ToString(int_value + incr); - if (use_txn) { - txn->Put(key, sum); + if (txn) { + s = txn->Put(key, sum); + if (!s.ok()) { + failed = true; + break; + } } else { batch->Put(key, sum); } } - if 
(use_txn) { - s = txn->Commit(); + if (txn) { + if (failed) { + txn->Rollback(); + } else { + s = txn->Commit(); + } } else { s = db->Write(write_options_, batch); } @@ -3616,7 +3650,7 @@ class Benchmark { // Ideally, we'd want to run this stress test with enough concurrency // on a small enough set of keys that we get some failed transactions // due to conflicts. - if (use_txn && s.IsBusy()) { + if (txn && s.IsBusy()) { transactions_aborted++; } else { fprintf(stderr, "Unexpected write error: %s\n", s.ToString().c_str()); @@ -3635,7 +3669,7 @@ class Benchmark { } char msg[100]; - if (use_txn) { + if (FLAGS_optimistic_transaction_db || FLAGS_transaction_db) { snprintf(msg, sizeof(msg), "( transactions:%" PRIu64 " aborts:%" PRIu64 ")", transactions_done, transactions_aborted); @@ -3653,7 +3687,7 @@ class Benchmark { // Since each iteration of RandomTransaction() incremented a key in each set // by the same value, the sum of the keys in each set should be the same. void RandomTransactionVerify() { - if (!FLAGS_transaction_db) { + if (!FLAGS_transaction_db && !FLAGS_optimistic_transaction_db) { // transactions not used, nothing to verify. 
return; } diff --git a/db/db_impl.cc b/db/db_impl.cc index 7652a66d6..9837ed3b4 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3686,7 +3686,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, mutex_.Lock(); } - if (db_options_.paranoid_checks && !status.ok() && + if (db_options_.paranoid_checks && !status.ok() && !status.IsTimedOut() && !status.IsBusy() && bg_error_.ok()) { bg_error_ = status; // stop compaction & fail any further writes } @@ -3944,6 +3944,22 @@ SuperVersion* DBImpl::GetAndRefSuperVersion(uint32_t column_family_id) { return GetAndRefSuperVersion(cfd); } +// REQUIRED: mutex is NOT held +SuperVersion* DBImpl::GetAndRefSuperVersionUnlocked(uint32_t column_family_id) { + ColumnFamilyData* cfd; + { + InstrumentedMutexLock l(&mutex_); + auto column_family_set = versions_->GetColumnFamilySet(); + cfd = column_family_set->GetColumnFamily(column_family_id); + } + + if (!cfd) { + return nullptr; + } + + return GetAndRefSuperVersion(cfd); +} + void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv) { bool unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv); @@ -3974,6 +3990,22 @@ void DBImpl::ReturnAndCleanupSuperVersion(uint32_t column_family_id, ReturnAndCleanupSuperVersion(cfd, sv); } +// REQUIRED: Mutex should NOT be held. +void DBImpl::ReturnAndCleanupSuperVersionUnlocked(uint32_t column_family_id, + SuperVersion* sv) { + ColumnFamilyData* cfd; + { + InstrumentedMutexLock l(&mutex_); + auto column_family_set = versions_->GetColumnFamilySet(); + cfd = column_family_set->GetColumnFamily(column_family_id); + } + + // If SuperVersion is held, and we successfully fetched a cfd using + // GetAndRefSuperVersion(), it must still exist. + assert(cfd != nullptr); + ReturnAndCleanupSuperVersion(cfd, sv); +} + // REQUIRED: this function should only be called on the write thread or if the // mutex is held. 
ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) { @@ -3986,6 +4018,20 @@ ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) { return cf_memtables->GetColumnFamilyHandle(); } +// REQUIRED: mutex is NOT held. +ColumnFamilyHandle* DBImpl::GetColumnFamilyHandleUnlocked( + uint32_t column_family_id) { + ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get(); + + InstrumentedMutexLock l(&mutex_); + + if (!cf_memtables->Seek(column_family_id)) { + return nullptr; + } + + return cf_memtables->GetColumnFamilyHandle(); +} + void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, const Range* range, int n, uint64_t* sizes, bool include_memtable) { diff --git a/db/db_impl.h b/db/db_impl.h index d2f4b868d..6a7d1c3f5 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -326,6 +326,9 @@ class DBImpl : public DB { // mutex is held. SuperVersion* GetAndRefSuperVersion(uint32_t column_family_id); + // Same as above, should be called without mutex held and not on write thread. + SuperVersion* GetAndRefSuperVersionUnlocked(uint32_t column_family_id); + // Un-reference the super version and return it to thread local cache if // needed. If it is the last reference of the super version. Clean it up // after un-referencing it. @@ -336,11 +339,18 @@ class DBImpl : public DB { // REQUIRED: this function should only be called on the write thread. void ReturnAndCleanupSuperVersion(uint32_t colun_family_id, SuperVersion* sv); + // Same as above, should be called without mutex held and not on write thread. + void ReturnAndCleanupSuperVersionUnlocked(uint32_t colun_family_id, + SuperVersion* sv); + // REQUIRED: this function should only be called on the write thread or if the // mutex is held. Return value only valid until next call to this function or // mutex is released. ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id); + // Same as above, should be called without mutex held and not on write thread. 
+ ColumnFamilyHandle* GetColumnFamilyHandleUnlocked(uint32_t column_family_id); + protected: Env* const env_; const std::string dbname_; diff --git a/examples/Makefile b/examples/Makefile index 1535d9b29..0757f5f03 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -2,7 +2,7 @@ include ../make_config.mk .PHONY: clean -all: simple_example column_families_example compact_files_example c_simple_example transaction_example +all: simple_example column_families_example compact_files_example c_simple_example optimistic_transaction_example transaction_example simple_example: simple_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) @@ -19,8 +19,11 @@ compact_files_example: compact_files_example.cc c_simple_example: c_simple_example.o $(CXX) $@.o -o$@ ../librocksdb.a $(PLATFORM_LDFLAGS) $(EXEC_LDFLAGS) +optimistic_transaction_example: optimistic_transaction_example.cc + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) + transaction_example: transaction_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) clean: - rm -rf ./simple_example ./column_families_example ./compact_files_example ./c_simple_example c_simple_example.o ./transaction_example + rm -rf ./simple_example ./column_families_example ./compact_files_example ./c_simple_example c_simple_example.o ./optimistic_transaction_example ./transaction_example diff --git a/examples/optimistic_transaction_example.cc b/examples/optimistic_transaction_example.cc new file mode 100644 index 000000000..e9ab0e5ee --- /dev/null +++ b/examples/optimistic_transaction_example.cc @@ -0,0 +1,142 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef ROCKSDB_LITE + +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" + +using namespace rocksdb; + +std::string kDBPath = "/tmp/rocksdb_transaction_example"; + +int main() { + // open DB + Options options; + options.create_if_missing = true; + DB* db; + OptimisticTransactionDB* txn_db; + + Status s = OptimisticTransactionDB::Open(options, kDBPath, &txn_db); + assert(s.ok()); + db = txn_db->GetBaseDB(); + + WriteOptions write_options; + ReadOptions read_options; + OptimisticTransactionOptions txn_options; + std::string value; + + //////////////////////////////////////////////////////// + // + // Simple OptimisticTransaction Example ("Read Committed") + // + //////////////////////////////////////////////////////// + + // Start a transaction + Transaction* txn = txn_db->BeginTransaction(write_options); + assert(txn); + + // Read a key in this transaction + s = txn->Get(read_options, "abc", &value); + assert(s.IsNotFound()); + + // Write a key in this transaction + txn->Put("abc", "def"); + + // Read a key OUTSIDE this transaction. Does not affect txn. + s = db->Get(read_options, "abc", &value); + + // Write a key OUTSIDE of this transaction. + // Does not affect txn since this is an unrelated key. If we wrote key 'abc' + // here, the transaction would fail to commit. 
+ s = db->Put(write_options, "xyz", "zzz"); + + // Commit transaction + s = txn->Commit(); + assert(s.ok()); + delete txn; + + //////////////////////////////////////////////////////// + // + // "Repeatable Read" (Snapshot Isolation) Example + // -- Using a single Snapshot + // + //////////////////////////////////////////////////////// + + // Set a snapshot at start of transaction by setting set_snapshot=true + txn_options.set_snapshot = true; + txn = txn_db->BeginTransaction(write_options, txn_options); + + const Snapshot* snapshot = txn->GetSnapshot(); + + // Write a key OUTSIDE of transaction + db->Put(write_options, "abc", "xyz"); + + // Read a key using the snapshot + read_options.snapshot = snapshot; + s = txn->GetForUpdate(read_options, "abc", &value); + assert(value == "def"); + + // Attempt to commit transaction + s = txn->Commit(); + + // Transaction could not commit since the write outside of the txn conflicted + // with the read! + assert(s.IsBusy()); + + delete txn; + // Clear snapshot from read options since it is no longer valid + read_options.snapshot = nullptr; + snapshot = nullptr; + + //////////////////////////////////////////////////////// + // + // "Read Committed" (Monotonic Atomic Views) Example + // --Using multiple Snapshots + // + //////////////////////////////////////////////////////// + + // In this example, we set the snapshot multiple times. This is probably + // only necessary if you have very strict isolation requirements to + // implement. 
+ + // Set a snapshot at start of transaction + txn_options.set_snapshot = true; + txn = txn_db->BeginTransaction(write_options, txn_options); + + // Do some reads and writes to key "x" + read_options.snapshot = db->GetSnapshot(); + s = txn->Get(read_options, "x", &value); + txn->Put("x", "x"); + + // Do a write outside of the transaction to key "y" + s = db->Put(write_options, "y", "y"); + + // Set a new snapshot in the transaction + txn->SetSnapshot(); + read_options.snapshot = db->GetSnapshot(); + + // Do some reads and writes to key "y" + s = txn->GetForUpdate(read_options, "y", &value); + txn->Put("y", "y"); + + // Commit. Since the snapshot was advanced, the write done outside of the + // transaction does not prevent this transaction from Committing. + s = txn->Commit(); + assert(s.ok()); + delete txn; + // Clear snapshot from read options since it is no longer valid + read_options.snapshot = nullptr; + + // Cleanup + delete txn_db; + DestroyDB(kDBPath, options); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/examples/transaction_example.cc b/examples/transaction_example.cc index 02f309c59..a7d506129 100644 --- a/examples/transaction_example.cc +++ b/examples/transaction_example.cc @@ -8,8 +8,8 @@ #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" -#include "rocksdb/utilities/optimistic_transaction.h" -#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" using namespace rocksdb; @@ -18,17 +18,16 @@ std::string kDBPath = "/tmp/rocksdb_transaction_example"; int main() { // open DB Options options; + TransactionDBOptions txn_db_options; options.create_if_missing = true; - DB* db; - OptimisticTransactionDB* txn_db; + TransactionDB* txn_db; - Status s = OptimisticTransactionDB::Open(options, kDBPath, &txn_db); + Status s = TransactionDB::Open(options, txn_db_options, kDBPath, &txn_db); assert(s.ok()); - db = txn_db->GetBaseDB(); 
WriteOptions write_options; ReadOptions read_options; - OptimisticTransactionOptions txn_options; + TransactionOptions txn_options; std::string value; //////////////////////////////////////////////////////// @@ -38,7 +37,7 @@ int main() { //////////////////////////////////////////////////////// // Start a transaction - OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); + Transaction* txn = txn_db->BeginTransaction(write_options); assert(txn); // Read a key in this transaction @@ -46,15 +45,16 @@ int main() { assert(s.IsNotFound()); // Write a key in this transaction - txn->Put("abc", "def"); + s = txn->Put("abc", "def"); + assert(s.ok()); // Read a key OUTSIDE this transaction. Does not affect txn. - s = db->Get(read_options, "abc", &value); + s = txn_db->Get(read_options, "abc", &value); // Write a key OUTSIDE of this transaction. // Does not affect txn since this is an unrelated key. If we wrote key 'abc' // here, the transaction would fail to commit. - s = db->Put(write_options, "xyz", "zzz"); + s = txn_db->Put(write_options, "xyz", "zzz"); // Commit transaction s = txn->Commit(); @@ -75,20 +75,17 @@ int main() { const Snapshot* snapshot = txn->GetSnapshot(); // Write a key OUTSIDE of transaction - db->Put(write_options, "abc", "xyz"); + s = txn_db->Put(write_options, "abc", "xyz"); + assert(s.ok()); - // Read a key using the snapshot + // Attempt to read a key using the snapshot. This will fail since + // the previous write outside this txn conflicts with this read. read_options.snapshot = snapshot; s = txn->GetForUpdate(read_options, "abc", &value); - assert(value == "def"); - - // Attempt to commit transaction - s = txn->Commit(); - - // Transaction could not commit since the write outside of the txn conflicted - // with the read! 
assert(s.IsBusy()); + txn->Rollback(); + delete txn; // Clear snapshot from read options since it is no longer valid read_options.snapshot = nullptr; @@ -110,23 +107,28 @@ int main() { txn = txn_db->BeginTransaction(write_options, txn_options); // Do some reads and writes to key "x" - read_options.snapshot = db->GetSnapshot(); + read_options.snapshot = txn_db->GetSnapshot(); s = txn->Get(read_options, "x", &value); txn->Put("x", "x"); // Do a write outside of the transaction to key "y" - s = db->Put(write_options, "y", "y"); + s = txn_db->Put(write_options, "y", "y"); // Set a new snapshot in the transaction txn->SetSnapshot(); - read_options.snapshot = db->GetSnapshot(); + txn->SetSavePoint(); + read_options.snapshot = txn_db->GetSnapshot(); // Do some reads and writes to key "y" + // Since the snapshot was advanced, the write done outside of the + // transaction does not conflict. s = txn->GetForUpdate(read_options, "y", &value); txn->Put("y", "y"); - // Commit. Since the snapshot was advanced, the write done outside of the - // transaction does not prevent this transaction from Committing. + // Decide we want to revert the last write from this transaction. + txn->RollbackToSavePoint(); + + // Commit. s = txn->Commit(); assert(s.ok()); delete txn; diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index d99fabac0..888f21266 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -80,6 +80,10 @@ class Status { static Status Busy(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kBusy, msg, msg2); } + static Status TimedOut() { return Status(kTimedOut); } + static Status TimedOut(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kTimedOut, msg, msg2); + } // Returns true iff the status indicates success. 
bool ok() const { return code() == kOk; } diff --git a/include/rocksdb/utilities/optimistic_transaction.h b/include/rocksdb/utilities/optimistic_transaction.h deleted file mode 100644 index c3f18f356..000000000 --- a/include/rocksdb/utilities/optimistic_transaction.h +++ /dev/null @@ -1,233 +0,0 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#pragma once - -#ifndef ROCKSDB_LITE - -#include -#include - -#include "rocksdb/comparator.h" -#include "rocksdb/db.h" -#include "rocksdb/status.h" - -namespace rocksdb { - -class OptimisticTransactionDB; -class WriteBatchWithIndex; - -// Provides BEGIN/COMMIT/ROLLBACK transactions for batched writes. -// -// The current implementation provides optimistic concurrency control. -// Transactional reads/writes will not block other operations in the -// db. At commit time, the batch of writes will only be written if there have -// been no other writes to any keys read or written by this transaction. -// Otherwise, the commit will return an error. -// -// A new optimistic transaction is created by calling -// OptimisticTransactionDB::BeginTransaction(). -// Only reads/writes done through this transaction object will be a part of the -// transaction. Any other reads/writes will not be tracked by this -// transaction. -// -// For example, reading data via OptimisticTransaction::GetForUpdate() will -// prevent the transaction from committing if this key is written to outside of -// this transaction. Any reads done via DB::Get() will not be checked for -// conflicts at commit time. -// -// It is up to the caller to synchronize access to this object. -// -// See examples/transaction_example.cc for some simple examples. 
-// -// TODO(agiardullo): Not yet implemented: -// -Transaction support for iterators -// -Ensuring memtable holds large enough history to check for conflicts -// -Support for using Transactions with DBWithTTL - -// Options to use when starting an Optimistic Transaction -struct OptimisticTransactionOptions { - // Setting set_snapshot=true is the same as calling SetSnapshot(). - bool set_snapshot = false; - - // Should be set if the DB has a non-default comparator. - // See comment in WriteBatchWithIndex constructor. - const Comparator* cmp = BytewiseComparator(); -}; - -class OptimisticTransaction { - public: - virtual ~OptimisticTransaction() {} - - // If SetSnapshot() is not called, all keys read/written through this - // transaction will only be committed if there have been no writes to - // these keys outside of this transaction *since the time each key - // was first read/written* in this transaction. - // - // When SetSnapshot() is called, this transaction will create a Snapshot - // to use for conflict validation of all future operations in the transaction. - // All future keys read/written will only be committed if there have been - // no writes to these keys outside of this transaction *since SetSnapshot() - // was called.* Otherwise, Commit() will not succeed. - // - // It is not necessary to call SetSnapshot() if you only care about other - // writes happening on keys *after* they have first been read/written in this - // transaction. However, you should set a snapshot if you are concerned - // with any other writes happening since a particular time (such as - // the start of the transaction). - // - // SetSnapshot() may be called multiple times if you would like to change - // the snapshot used for different operations in this transaction. - // - // Calling SetSnapshot will not affect the version of Data returned by Get() - // methods. See OptimisticTransaction::Get() for more details. 
- // - // TODO(agiardullo): add better documentation here once memtable change are - // committed - virtual void SetSnapshot() = 0; - - // Returns the Snapshot created by the last call to SetSnapshot(). - // - // REQUIRED: The returned Snapshot is only valid up until the next time - // SetSnapshot() is called or the OptimisticTransaction is deleted. - virtual const Snapshot* GetSnapshot() const = 0; - - // Write all batched keys to the db atomically if there have not been any - // other writes performed on the keys read/written by this transaction. - // - // Currently, Commit() only checks the memtables to verify that there are no - // other writes to these keys. If the memtable's history is not long - // enough to verify that there are no conflicts, Commit() will return - // a non-OK status. - // - // Returns OK on success, non-OK on failure. - virtual Status Commit() = 0; - - // Discard all batched writes in this transaction. - virtual void Rollback() = 0; - - // This function is similar to DB::Get() except it will also read pending - // changes in this transaction. - // - // If read_options.snapshot is not set, the current version of the key will - // be read. Calling SetSnapshot() does not affect the version of the data - // returned. - // - // Note that setting read_options.snapshot will affect what is read from the - // DB but will NOT change which keys are read from this transaction (the keys - // in this transaction do not yet belong to any snapshot and will be fetched - // regardless). 
- // - virtual Status Get(const ReadOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - std::string* value) = 0; - - virtual Status Get(const ReadOptions& options, const Slice& key, - std::string* value) = 0; - - virtual std::vector MultiGet( - const ReadOptions& options, - const std::vector& column_family, - const std::vector& keys, std::vector* values) = 0; - - virtual std::vector MultiGet(const ReadOptions& options, - const std::vector& keys, - std::vector* values) = 0; - - // Read this key and ensure that this transaction will only - // be able to be committed if this key is not written outside this - // transaction after it has first been read (or after the snapshot if a - // snapshot is set in this transaction). - - // This function is similar to OptimisticTransaction::Get() except it will - // affect whether this transaction will be able to be committed. - virtual Status GetForUpdate(const ReadOptions& options, - ColumnFamilyHandle* column_family, - const Slice& key, std::string* value) = 0; - - virtual Status GetForUpdate(const ReadOptions& options, const Slice& key, - std::string* value) = 0; - - virtual std::vector MultiGetForUpdate( - const ReadOptions& options, - const std::vector& column_family, - const std::vector& keys, std::vector* values) = 0; - - virtual std::vector MultiGetForUpdate( - const ReadOptions& options, const std::vector& keys, - std::vector* values) = 0; - - // Put, Merge, and Delete behave similarly to their corresponding - // functions in WriteBatch. In addition, this transaction will only - // be able to be committed if these keys are not written outside of this - // transaction after they have been written by this transaction (or after the - // snapshot if a snapshot is set in this transaction). 
- virtual void Put(ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) = 0; - virtual void Put(const Slice& key, const Slice& value) = 0; - virtual void Put(ColumnFamilyHandle* column_family, const SliceParts& key, - const SliceParts& value) = 0; - virtual void Put(const SliceParts& key, const SliceParts& value) = 0; - - virtual void Merge(ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) = 0; - virtual void Merge(const Slice& key, const Slice& value) = 0; - - virtual void Delete(ColumnFamilyHandle* column_family, const Slice& key) = 0; - virtual void Delete(const Slice& key) = 0; - virtual void Delete(ColumnFamilyHandle* column_family, - const SliceParts& key) = 0; - virtual void Delete(const SliceParts& key) = 0; - - // PutUntracked() will write a Put to the batch of operations to be committed - // in this transaction. This write will only happen if this transaction - // gets committed successfully. But unlike OptimisticTransaction::Put(), - // no conflict checking will be done for this key. So any other writes to - // this key outside of this transaction will not prevent this transaction from - // committing. 
- virtual void PutUntracked(ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) = 0; - virtual void PutUntracked(const Slice& key, const Slice& value) = 0; - virtual void PutUntracked(ColumnFamilyHandle* column_family, - const SliceParts& key, const SliceParts& value) = 0; - virtual void PutUntracked(const SliceParts& key, const SliceParts& value) = 0; - - virtual void MergeUntracked(ColumnFamilyHandle* column_family, - const Slice& key, const Slice& value) = 0; - virtual void MergeUntracked(const Slice& key, const Slice& value) = 0; - - virtual void DeleteUntracked(ColumnFamilyHandle* column_family, - const Slice& key) = 0; - - virtual void DeleteUntracked(const Slice& key) = 0; - virtual void DeleteUntracked(ColumnFamilyHandle* column_family, - const SliceParts& key) = 0; - virtual void DeleteUntracked(const SliceParts& key) = 0; - - // Similar to WriteBatch::PutLogData - virtual void PutLogData(const Slice& blob) = 0; - - // Fetch the underlying write batch that contains all pending changes to be - // committed. - // - // Note: You should not write or delete anything from the batch directly and - // should only use the the functions in the OptimisticTransaction class to - // write to this transaction. 
- virtual WriteBatchWithIndex* GetWriteBatch() = 0; - - protected: - // To begin a new transaction, see OptimisticTransactionDB::BeginTransaction() - explicit OptimisticTransaction(const OptimisticTransactionDB* db) {} - OptimisticTransaction() {} - - private: - // No copying allowed - OptimisticTransaction(const OptimisticTransaction&); - void operator=(const OptimisticTransaction&); -}; - -} // namespace rocksdb - -#endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/optimistic_transaction_db.h b/include/rocksdb/utilities/optimistic_transaction_db.h index 677f39122..772e64549 100644 --- a/include/rocksdb/utilities/optimistic_transaction_db.h +++ b/include/rocksdb/utilities/optimistic_transaction_db.h @@ -11,16 +11,25 @@ #include "rocksdb/comparator.h" #include "rocksdb/db.h" -#include "rocksdb/utilities/optimistic_transaction.h" namespace rocksdb { -class OptimisticTransaction; +class Transaction; // Database with Transaction support. // // See optimistic_transaction.h and examples/transaction_example.cc +// Options to use when starting an Optimistic Transaction +struct OptimisticTransactionOptions { + // Setting set_snapshot=true is the same as calling SetSnapshot(). + bool set_snapshot = false; + + // Should be set if the DB has a non-default comparator. + // See comment in WriteBatchWithIndex constructor. + const Comparator* cmp = BytewiseComparator(); +}; + class OptimisticTransactionDB { public: // Open an OptimisticTransactionDB similar to DB::Open(). @@ -34,13 +43,12 @@ class OptimisticTransactionDB { virtual ~OptimisticTransactionDB() {} - // Starts a new OptimisticTransaction. Passing set_snapshot=true has the same - // effect + // Starts a new Transaction. Passing set_snapshot=true has the same effect // as calling SetSnapshot(). // // Caller should delete the returned transaction after calling // Commit() or Rollback(). 
- virtual OptimisticTransaction* BeginTransaction( + virtual Transaction* BeginTransaction( const WriteOptions& write_options, const OptimisticTransactionOptions& txn_options = OptimisticTransactionOptions()) = 0; diff --git a/include/rocksdb/utilities/transaction.h b/include/rocksdb/utilities/transaction.h new file mode 100644 index 000000000..86345efed --- /dev/null +++ b/include/rocksdb/utilities/transaction.h @@ -0,0 +1,260 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include + +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +class Iterator; +class TransactionDB; +class WriteBatchWithIndex; + +// Provides BEGIN/COMMIT/ROLLBACK transactions. +// +// To use transactions, you must first create either an OptimisticTransactionDB +// or a TransactionDB. See examples/[optimistic_]transaction_example.cc for +// more information. +// +// To create a transaction, use [Optimistic]TransactionDB::BeginTransaction(). +// +// It is up to the caller to synchronize access to this object. +// +// See examples/transaction_example.cc for some simple examples. +// +// TODO(agiardullo): Not yet implemented +// -PerfContext statistics +// -Support for using Transactions with DBWithTTL +class Transaction { + public: + virtual ~Transaction() {} + + // If a transaction has a snapshot set, the transaction will ensure that + // any keys successfully written(or fetched via GetForUpdate()) have not + // been modified outside of this transaction since the time the snapshot was + // set. 
+ // If a snapshot has not been set, the transaction guarantees that keys have + // not been modified since the time each key was first written (or fetched via + // GetForUpdate()). + // + // Using SetSnapshot() will provide stricter isolation guarantees at the + // expense of potentially more transaction failures due to conflicts with + // other writes. + // + // Calling SetSnapshot() has no effect on keys written before this function + // has been called. + // + // SetSnapshot() may be called multiple times if you would like to change + // the snapshot used for different operations in this transaction. + // + // Calling SetSnapshot will not affect the version of Data returned by Get() + // methods. See Transaction::Get() for more details. + virtual void SetSnapshot() = 0; + + // Returns the Snapshot created by the last call to SetSnapshot(). + // + // REQUIRED: The returned Snapshot is only valid up until the next time + // SetSnapshot() is called or the Transaction is deleted. + virtual const Snapshot* GetSnapshot() const = 0; + + // Write all batched keys to the db atomically. + // + // Returns OK on success. + // + // May return any error status that could be returned by DB::Write(). + // + // If this transaction was created by an OptimisticTransactionDB(), + // Status::Busy() may be returned if the transaction could not guarantee + // that there are no write conflicts. + // + // If this transaction was created by a TransactionDB(), Status::TimedOut() + // may be returned if this transaction has lived for longer than + // TransactionOptions.expiration. + virtual Status Commit() = 0; + + // Discard all batched writes in this transaction. + virtual void Rollback() = 0; + + // Records the state of the transaction for future calls to + // RollbackToSavePoint(). May be called multiple times to set multiple save + // points.
+ virtual void SetSavePoint() = 0; + + // Undo all operations in this transaction (Put, Merge, Delete, PutLogData) + // since the + // most recent call to SetSavePoint() and removes the most recent + // SetSavePoint(). + // If there is no previous call to SetSavePoint(), behaves the same as + // Rollback() + virtual void RollbackToSavePoint() = 0; + + // This function is similar to DB::Get() except it will also read pending + // changes in this transaction. + // + // If read_options.snapshot is not set, the current version of the key will + // be read. Calling SetSnapshot() does not affect the version of the data + // returned. + // + // Note that setting read_options.snapshot will affect what is read from the + // DB but will NOT change which keys are read from this transaction (the keys + // in this transaction do not yet belong to any snapshot and will be fetched + // regardless). + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) = 0; + + virtual Status Get(const ReadOptions& options, const Slice& key, + std::string* value) = 0; + + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) = 0; + + virtual std::vector MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values) = 0; + + // Read this key and ensure that this transaction will only + // be able to be committed if this key is not written outside this + // transaction after it has first been read (or after the snapshot if a + // snapshot is set in this transaction). The transaction behavior is the + // same regardless of whether the key exists or not. + // + // The values returned by this function are similar to Transaction::Get(). + // If value==nullptr, then this function will not read any data, but will + // still ensure that this key cannot be written to by outside of this + // transaction. 
+ // + // If this transaction was created by a TransactionDB, Status::Busy() may be + // returned. + // If this transaction was created by an OptimisticTransactionDB, GetForUpdate() + // could cause Commit() to later return Status::Busy(). + virtual Status GetForUpdate(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, std::string* value) = 0; + + virtual Status GetForUpdate(const ReadOptions& options, const Slice& key, + std::string* value) = 0; + + virtual std::vector MultiGetForUpdate( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) = 0; + + virtual std::vector MultiGetForUpdate( + const ReadOptions& options, const std::vector& keys, + std::vector* values) = 0; + + // Returns an iterator that will iterate on all keys in the default + // column family including both keys in the DB and uncommitted keys in this + // transaction. + // + // Setting read_options.snapshot will affect what is read from the + // DB but will NOT change which keys are read from this transaction (the keys + // in this transaction do not yet belong to any snapshot and will be fetched + // regardless). + // + // Caller is responsible for deleting the returned Iterator. + // + // The returned iterator is only valid until Commit(), Rollback(), or + // RollbackToSavePoint() is called. + // NOTE: Transaction::Put/Merge/Delete will currently invalidate this iterator + // until + // the following issue is fixed: + // https://github.com/facebook/rocksdb/issues/616 + virtual Iterator* GetIterator(const ReadOptions& read_options) = 0; + + virtual Iterator* GetIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) = 0; + + // Put, Merge, and Delete behave similarly to their corresponding + // functions in WriteBatch, but will also do conflict checking on the + // keys being written.
+ // + // If this Transaction was created on an OptimisticTransactionDB, these + // functions should always return Status::OK(). + // If this Transaction was created on a TransactionDB, the functions can + // return Status::Busy() if they could not acquire a lock. + virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) = 0; + virtual Status Put(const Slice& key, const Slice& value) = 0; + virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) = 0; + virtual Status Put(const SliceParts& key, const SliceParts& value) = 0; + + virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) = 0; + virtual Status Merge(const Slice& key, const Slice& value) = 0; + + virtual Status Delete(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + virtual Status Delete(const Slice& key) = 0; + virtual Status Delete(ColumnFamilyHandle* column_family, + const SliceParts& key) = 0; + virtual Status Delete(const SliceParts& key) = 0; + + // PutUntracked() will write a Put to the batch of operations to be committed + // in this transaction. This write will only happen if this transaction + // gets committed successfully. But unlike Transaction::Put(), + // no conflict checking will be done for this key. + // + // If this Transaction was created on a TransactionDB, this function will + // still acquire locks necessary to make sure this write doesn't cause + // conflicts in + // other transactions and may return Status::Busy(). 
+ virtual Status PutUntracked(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) = 0; + virtual Status PutUntracked(const Slice& key, const Slice& value) = 0; + virtual Status PutUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key, + const SliceParts& value) = 0; + virtual Status PutUntracked(const SliceParts& key, + const SliceParts& value) = 0; + + virtual Status MergeUntracked(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) = 0; + virtual Status MergeUntracked(const Slice& key, const Slice& value) = 0; + + virtual Status DeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + + virtual Status DeleteUntracked(const Slice& key) = 0; + virtual Status DeleteUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key) = 0; + virtual Status DeleteUntracked(const SliceParts& key) = 0; + + // Similar to WriteBatch::PutLogData + virtual void PutLogData(const Slice& blob) = 0; + + // Fetch the underlying write batch that contains all pending changes to be + // committed. + // + // Note: You should not write or delete anything from the batch directly and + // should only use the functions in the Transaction class to + // write to this transaction. + virtual WriteBatchWithIndex* GetWriteBatch() = 0; + + protected: + explicit Transaction(const TransactionDB* db) {} + Transaction() {} + + private: + // No copying allowed + Transaction(const Transaction&); + void operator=(const Transaction&); +}; + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h new file mode 100644 index 000000000..0f9a1773e --- /dev/null +++ b/include/rocksdb/utilities/transaction_db.h @@ -0,0 +1,130 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree.
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include + +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/utilities/stackable_db.h" +#include "rocksdb/utilities/transaction.h" + +// Database with Transaction support. +// +// See transaction.h and examples/transaction_example.cc + +namespace rocksdb { + +struct TransactionDBOptions { + // Specifies the maximum number of keys that can be locked at the same time + // per column family. + // If the number of locked keys is greater than max_num_locks, transaction + // writes (or GetForUpdate) will return an error. + // If this value is not positive, no limit will be enforced. + int64_t max_num_locks = -1; + + // Increasing this value will increase the concurrency by dividing the lock + // table (per column family) into more sub-tables, each with their own + // separate + // mutex. + size_t num_stripes = 16; + + // If positive, specifies the default wait timeout in milliseconds when + // a transaction attempts to lock a key if not specified by + // TransactionOptions::lock_timeout. + // + // If 0, no waiting is done if a lock cannot instantly be acquired. + // If negative, there is no timeout. Not using a timeout is not recommended + // as it can lead to deadlocks. Currently, there is no deadlock-detection to + // recover + // from a deadlock. + int64_t transaction_lock_timeout = 1000; // 1 second + + // If positive, specifies the wait timeout in milliseconds when writing a key + // OUTSIDE of a transaction (ie by calling DB::Put(),Merge(),Delete(),Write() + // directly). + // If 0, no waiting is done if a lock cannot instantly be acquired. + // If negative, there is no timeout and will block indefinitely when acquiring + // a lock. + // + // Not using a timeout can lead to deadlocks. Currently, there + // is no deadlock-detection to recover from a deadlock.
While DB writes + // cannot deadlock with other DB writes, they can deadlock with a transaction. + // A negative timeout should only be used if all transactions have a small + // expiration set. + int64_t default_lock_timeout = 1000; // 1 second +}; + +struct TransactionOptions { + // Setting set_snapshot=true is the same as calling + // Transaction::SetSnapshot(). + bool set_snapshot = false; + + + // TODO(agiardullo): TransactionDB does not yet support comparators that allow + // two non-equal keys to be equivalent. I.e., cmp->Compare(a,b) should only + // return 0 if + // a.compare(b) returns 0. + + + // If positive, specifies the wait timeout in milliseconds when + // a transaction attempts to lock a key. + // + // If 0, no waiting is done if a lock cannot instantly be acquired. + // If negative, TransactionDBOptions::transaction_lock_timeout will be used. + int64_t lock_timeout = -1; + + // Expiration duration in milliseconds. If non-negative, transactions that + // last longer than this many milliseconds will fail to commit. If not set, + // a forgotten transaction that is never committed, rolled back, or deleted + // will never relinquish any locks it holds. This could prevent keys from + // being + // written by other writers. + // + // TODO(agiardullo): Improve performance of checking expiration time. + int64_t expiration = -1; +}; + +class TransactionDB : public StackableDB { + public: + // Open a TransactionDB similar to DB::Open(). + static Status Open(const Options& options, + const TransactionDBOptions& txn_db_options, + const std::string& dbname, TransactionDB** dbptr); + + static Status Open(const DBOptions& db_options, + const TransactionDBOptions& txn_db_options, + const std::string& dbname, + const std::vector& column_families, + std::vector* handles, + TransactionDB** dbptr); + + virtual ~TransactionDB() {} + + // Starts a new Transaction. Passing set_snapshot=true has the same effect + // as calling Transaction::SetSnapshot().
+ // + // Caller should delete the returned transaction after calling + // Transaction::Commit() or Transaction::Rollback(). + virtual Transaction* BeginTransaction( + const WriteOptions& write_options, + const TransactionOptions& txn_options = TransactionOptions()) = 0; + + protected: + // To Create an TransactionDB, call Open() + explicit TransactionDB(DB* db) : StackableDB(db) {} + + private: + // No copying allowed + TransactionDB(const TransactionDB&); + void operator=(const TransactionDB&); +}; + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/src.mk b/src.mk index 4e94c7d94..8a6c4dc7f 100644 --- a/src.mk +++ b/src.mk @@ -118,6 +118,10 @@ LIB_SOURCES = \ utilities/table_properties_collectors/compact_on_deletion_collector.cc \ utilities/transactions/optimistic_transaction_impl.cc \ utilities/transactions/optimistic_transaction_db_impl.cc \ + utilities/transactions/transaction_db_impl.cc \ + utilities/transactions/transaction_lock_mgr.cc \ + utilities/transactions/transaction_impl.cc \ + utilities/transactions/transaction_util.cc \ utilities/ttl/db_ttl_impl.cc \ utilities/write_batch_with_index/write_batch_with_index.cc \ utilities/write_batch_with_index/write_batch_with_index_internal.cc \ @@ -235,6 +239,7 @@ TEST_BENCH_SOURCES = \ utilities/spatialdb/spatial_db_test.cc \ utilities/table_properties_collectors/compact_on_deletion_collector_test.cc \ utilities/transactions/optimistic_transaction_test.cc \ + utilities/transactions/transaction_test.cc \ utilities/ttl/ttl_test.cc \ utilities/write_batch_with_index/write_batch_with_index_test.cc \ util/log_write_bench.cc \ diff --git a/util/status.cc b/util/status.cc index d956eb476..3fe292dd3 100644 --- a/util/status.cc +++ b/util/status.cc @@ -67,6 +67,9 @@ std::string Status::ToString() const { case kShutdownInProgress: type = "Shutdown in progress: "; break; + case kTimedOut: + type = "Operation timed out: "; + break; case kAborted: type = "Operation aborted: "; break; diff --git 
a/utilities/transactions/optimistic_transaction_db_impl.cc b/utilities/transactions/optimistic_transaction_db_impl.cc index 56f612021..ca9897211 100644 --- a/utilities/transactions/optimistic_transaction_db_impl.cc +++ b/utilities/transactions/optimistic_transaction_db_impl.cc @@ -18,10 +18,10 @@ namespace rocksdb { -OptimisticTransaction* OptimisticTransactionDBImpl::BeginTransaction( +Transaction* OptimisticTransactionDBImpl::BeginTransaction( const WriteOptions& write_options, const OptimisticTransactionOptions& txn_options) { - OptimisticTransaction* txn = + Transaction* txn = new OptimisticTransactionImpl(this, write_options, txn_options); return txn; diff --git a/utilities/transactions/optimistic_transaction_db_impl.h b/utilities/transactions/optimistic_transaction_db_impl.h index bfd452990..ec5b42823 100644 --- a/utilities/transactions/optimistic_transaction_db_impl.h +++ b/utilities/transactions/optimistic_transaction_db_impl.h @@ -19,7 +19,7 @@ class OptimisticTransactionDBImpl : public OptimisticTransactionDB { ~OptimisticTransactionDBImpl() {} - OptimisticTransaction* BeginTransaction( + Transaction* BeginTransaction( const WriteOptions& write_options, const OptimisticTransactionOptions& txn_options) override; diff --git a/utilities/transactions/optimistic_transaction_impl.cc b/utilities/transactions/optimistic_transaction_impl.cc index d45117236..1defd32a6 100644 --- a/utilities/transactions/optimistic_transaction_impl.cc +++ b/utilities/transactions/optimistic_transaction_impl.cc @@ -7,11 +7,7 @@ #include "utilities/transactions/optimistic_transaction_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include @@ -22,6 +18,7 @@ #include "rocksdb/status.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "util/string_util.h" +#include "utilities/transactions/transaction_util.h" namespace rocksdb { @@ -34,7 +31,8 @@ OptimisticTransactionImpl::OptimisticTransactionImpl( 
db_(txn_db->GetBaseDB()), write_options_(write_options), snapshot_(nullptr), - write_batch_(txn_options.cmp, 0, true) { + cmp_(txn_options.cmp), + write_batch_(new WriteBatchWithIndex(txn_options.cmp, 0, true)) { if (txn_options.set_snapshot) { SetSnapshot(); } else { @@ -72,11 +70,12 @@ Status OptimisticTransactionImpl::Commit() { } Status s = db_impl->WriteWithCallback( - write_options_, write_batch_.GetWriteBatch(), &callback); + write_options_, write_batch_->GetWriteBatch(), &callback); if (s.ok()) { tracked_keys_.clear(); - write_batch_.Clear(); + write_batch_->Clear(); + num_entries_ = 0; } return s; @@ -84,7 +83,57 @@ Status OptimisticTransactionImpl::Commit() { void OptimisticTransactionImpl::Rollback() { tracked_keys_.clear(); - write_batch_.Clear(); + write_batch_->Clear(); + num_entries_ = 0; +} + +void OptimisticTransactionImpl::SetSavePoint() { + if (num_entries_ > 0) { + // If transaction is empty, no need to record anything. + + if (save_points_ == nullptr) { + save_points_.reset(new std::stack()); + } + save_points_->push(num_entries_); + } +} + +void OptimisticTransactionImpl::RollbackToSavePoint() { + size_t savepoint_entries = 0; + + if (save_points_ != nullptr && save_points_->size() > 0) { + savepoint_entries = save_points_->top(); + save_points_->pop(); + } + + assert(savepoint_entries <= num_entries_); + + if (savepoint_entries == num_entries_) { + // No changes to rollback + } else if (savepoint_entries == 0) { + // Rollback everything + Rollback(); + } else { + DBImpl* db_impl = dynamic_cast(db_->GetRootDB()); + assert(db_impl); + + WriteBatchWithIndex* new_batch = new WriteBatchWithIndex(cmp_, 0, true); + Status s = TransactionUtil::CopyFirstN( + savepoint_entries, write_batch_.get(), new_batch, db_impl); + + if (!s.ok()) { + // TODO: Should we change this function to return a Status or should we + // somehow make it + // so RollbackToSavePoint() can never fail?? 
+ // Consider moving this functionality into WriteBatchWithIndex + fprintf(stderr, "STATUS: %s \n", s.ToString().c_str()); + delete new_batch; + } else { + write_batch_.reset(new_batch); + } + + num_entries_ = savepoint_entries; + } } // Record this key so that we can check it for conflicts at commit time. @@ -135,8 +184,8 @@ void OptimisticTransactionImpl::RecordOperation( Status OptimisticTransactionImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value) { - return write_batch_.GetFromBatchAndDB(db_, read_options, column_family, key, - value); + return write_batch_->GetFromBatchAndDB(db_, read_options, column_family, key, + value); } Status OptimisticTransactionImpl::GetForUpdate( @@ -145,7 +194,11 @@ Status OptimisticTransactionImpl::GetForUpdate( // Regardless of whether the Get succeeded, track this key. RecordOperation(column_family, key); - return Get(read_options, column_family, key, value); + if (value == nullptr) { + return Status::OK(); + } else { + return Get(read_options, column_family, key, value); + } } std::vector OptimisticTransactionImpl::MultiGet( @@ -159,7 +212,7 @@ std::vector OptimisticTransactionImpl::MultiGet( // TODO(agiardullo): optimize multiget? std::vector stat_list(num_keys); for (size_t i = 0; i < num_keys; ++i) { - std::string* value = &(*values)[i]; + std::string* value = values ? &(*values)[i] : nullptr; stat_list[i] = Get(read_options, column_family[i], keys[i], value); } @@ -180,169 +233,141 @@ std::vector OptimisticTransactionImpl::MultiGetForUpdate( // Regardless of whether the Get succeeded, track this key. RecordOperation(column_family[i], keys[i]); - std::string* value = &(*values)[i]; + std::string* value = values ? 
&(*values)[i] : nullptr; stat_list[i] = Get(read_options, column_family[i], keys[i], value); } return stat_list; } -void OptimisticTransactionImpl::Put(ColumnFamilyHandle* column_family, - const Slice& key, const Slice& value) { +Iterator* OptimisticTransactionImpl::GetIterator( + const ReadOptions& read_options) { + Iterator* db_iter = db_->NewIterator(read_options); + assert(db_iter); + + return write_batch_->NewIteratorWithBase(db_iter); +} + +Iterator* OptimisticTransactionImpl::GetIterator( + const ReadOptions& read_options, ColumnFamilyHandle* column_family) { + Iterator* db_iter = db_->NewIterator(read_options, column_family); + assert(db_iter); + + return write_batch_->NewIteratorWithBase(column_family, db_iter); +} + +Status OptimisticTransactionImpl::Put(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { RecordOperation(column_family, key); - write_batch_.Put(column_family, key, value); + write_batch_->Put(column_family, key, value); + num_entries_++; + + return Status::OK(); } -void OptimisticTransactionImpl::Put(ColumnFamilyHandle* column_family, - const SliceParts& key, - const SliceParts& value) { +Status OptimisticTransactionImpl::Put(ColumnFamilyHandle* column_family, + const SliceParts& key, + const SliceParts& value) { RecordOperation(column_family, key); - write_batch_.Put(column_family, key, value); + write_batch_->Put(column_family, key, value); + num_entries_++; + + return Status::OK(); } -void OptimisticTransactionImpl::Merge(ColumnFamilyHandle* column_family, - const Slice& key, const Slice& value) { +Status OptimisticTransactionImpl::Merge(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { RecordOperation(column_family, key); - write_batch_.Merge(column_family, key, value); + write_batch_->Merge(column_family, key, value); + num_entries_++; // Save points count every batch entry. + return Status::OK(); } -void OptimisticTransactionImpl::Delete(ColumnFamilyHandle* column_family, - const Slice& key) { +Status
OptimisticTransactionImpl::Delete(ColumnFamilyHandle* column_family, + const Slice& key) { RecordOperation(column_family, key); - write_batch_.Delete(column_family, key); + write_batch_->Delete(column_family, key); + num_entries_++; // Save points count every batch entry. + return Status::OK(); } -void OptimisticTransactionImpl::Delete(ColumnFamilyHandle* column_family, - const SliceParts& key) { +Status OptimisticTransactionImpl::Delete(ColumnFamilyHandle* column_family, + const SliceParts& key) { RecordOperation(column_family, key); - write_batch_.Delete(column_family, key); + write_batch_->Delete(column_family, key); + num_entries_++; // Save points count every batch entry. + return Status::OK(); } -void OptimisticTransactionImpl::PutUntracked(ColumnFamilyHandle* column_family, - const Slice& key, - const Slice& value) { - write_batch_.Put(column_family, key, value); +Status OptimisticTransactionImpl::PutUntracked( + ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { + write_batch_->Put(column_family, key, value); + num_entries_++; + + return Status::OK(); } -void OptimisticTransactionImpl::PutUntracked(ColumnFamilyHandle* column_family, - const SliceParts& key, - const SliceParts& value) { - write_batch_.Put(column_family, key, value); +Status OptimisticTransactionImpl::PutUntracked( + ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) { + write_batch_->Put(column_family, key, value); + num_entries_++; + + return Status::OK(); } -void OptimisticTransactionImpl::MergeUntracked( +Status OptimisticTransactionImpl::MergeUntracked( ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { - write_batch_.Merge(column_family, key, value); + write_batch_->Merge(column_family, key, value); + num_entries_++; + + return Status::OK(); } -void OptimisticTransactionImpl::DeleteUntracked( +Status OptimisticTransactionImpl::DeleteUntracked( ColumnFamilyHandle* column_family, const Slice& key) { - write_batch_.Delete(column_family, key); + write_batch_->Delete(column_family, key); + num_entries_++; + + return
Status::OK(); } -void OptimisticTransactionImpl::DeleteUntracked( +Status OptimisticTransactionImpl::DeleteUntracked( ColumnFamilyHandle* column_family, const SliceParts& key) { - write_batch_.Delete(column_family, key); + write_batch_->Delete(column_family, key); + num_entries_++; + + return Status::OK(); } void OptimisticTransactionImpl::PutLogData(const Slice& blob) { - write_batch_.PutLogData(blob); + write_batch_->PutLogData(blob); + num_entries_++; } WriteBatchWithIndex* OptimisticTransactionImpl::GetWriteBatch() { - return &write_batch_; + return write_batch_.get(); } // Returns OK if it is safe to commit this transaction. Returns Status::Busy // if there are read or write conflicts that would prevent us from committing OR // if we can not determine whether there would be any such conflicts. // -// Should only be called on writer thread. +// Should only be called on writer thread in order to avoid any race conditions +// in detecting +// write conflicts. Status OptimisticTransactionImpl::CheckTransactionForConflicts(DB* db) { Status result; assert(dynamic_cast(db) != nullptr); auto db_impl = reinterpret_cast(db); - for (auto& tracked_keys_iter : tracked_keys_) { - uint32_t cf_id = tracked_keys_iter.first; - const auto& keys = tracked_keys_iter.second; - - SuperVersion* sv = db_impl->GetAndRefSuperVersion(cf_id); - if (sv == nullptr) { - result = - Status::Busy("Could not access column family " + ToString(cf_id)); - break; - } - - SequenceNumber earliest_seq = - db_impl->GetEarliestMemTableSequenceNumber(sv, true); - - // For each of the keys in this transaction, check to see if someone has - // written to this key since the start of the transaction. 
- for (const auto& key_iter : keys) { - const auto& key = key_iter.first; - const SequenceNumber key_seq = key_iter.second; - - // Since it would be too slow to check the SST files, we will only use - // the memtables to check whether there have been any recent writes - // to this key after it was accessed in this transaction. But if the - // Memtables do not contain a long enough history, we must fail the - // transaction. - if (earliest_seq == kMaxSequenceNumber) { - // The age of this memtable is unknown. Cannot rely on it to check - // for recent writes. This error shouldn't happen often in practice as - // the - // Memtable should have a valid earliest sequence number except in some - // corner cases (such as error cases during recovery). - result = Status::Busy( - "Could not commit transaction with as the MemTable does not " - "countain a long enough history to check write at SequenceNumber: ", - ToString(key_seq)); - - } else if (key_seq < earliest_seq) { - // The age of this memtable is too new to use to check for recent - // writes. - char msg[255]; - snprintf( - msg, sizeof(msg), - "Could not commit transaction with write at SequenceNumber %" PRIu64 - " as the MemTable only contains changes newer than SequenceNumber " - "%" PRIu64 - ". 
Increasing the value of the " - "max_write_buffer_number_to_maintain option could reduce the " - "frequency " - "of this error.", - key_seq, earliest_seq); - result = Status::Busy(msg); - } else { - SequenceNumber seq = kMaxSequenceNumber; - Status s = db_impl->GetLatestSequenceForKeyFromMemtable(sv, key, &seq); - if (!s.ok()) { - result = s; - } else if (seq != kMaxSequenceNumber && seq > key_seq) { - result = Status::Busy(); - } - } - - if (!result.ok()) { - break; - } - } - - db_impl->ReturnAndCleanupSuperVersion(cf_id, sv); - - if (!result.ok()) { - break; - } - } - - return result; + return TransactionUtil::CheckKeysForConflicts(db_impl, &tracked_keys_); } } // namespace rocksdb diff --git a/utilities/transactions/optimistic_transaction_impl.h b/utilities/transactions/optimistic_transaction_impl.h index 30272b97b..faf6a5794 100644 --- a/utilities/transactions/optimistic_transaction_impl.h +++ b/utilities/transactions/optimistic_transaction_impl.h @@ -7,6 +7,7 @@ #ifndef ROCKSDB_LITE +#include #include #include #include @@ -16,17 +17,14 @@ #include "rocksdb/slice.h" #include "rocksdb/status.h" #include "rocksdb/types.h" -#include "rocksdb/utilities/optimistic_transaction.h" +#include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/write_batch_with_index.h" +#include "utilities/transactions/transaction_util.h" namespace rocksdb { -using TransactionKeyMap = - std::unordered_map>; - -class OptimisticTransactionImpl : public OptimisticTransaction { +class OptimisticTransactionImpl : public Transaction { public: OptimisticTransactionImpl(OptimisticTransactionDB* db, const WriteOptions& write_options, @@ -38,6 +36,10 @@ class OptimisticTransactionImpl : public OptimisticTransaction { void Rollback() override; + void SetSavePoint() override; + + void RollbackToSavePoint() override; + Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value) 
override; @@ -84,57 +86,61 @@ class OptimisticTransactionImpl : public OptimisticTransaction { keys, values); } - void Put(ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override; - void Put(const Slice& key, const Slice& value) override { - Put(nullptr, key, value); + Iterator* GetIterator(const ReadOptions& read_options) override; + Iterator* GetIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) override; + + Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status Put(const Slice& key, const Slice& value) override { + return Put(nullptr, key, value); } - void Put(ColumnFamilyHandle* column_family, const SliceParts& key, - const SliceParts& value) override; - void Put(const SliceParts& key, const SliceParts& value) override { - Put(nullptr, key, value); + Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) override; + Status Put(const SliceParts& key, const SliceParts& value) override { + return Put(nullptr, key, value); } - void Merge(ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override; - void Merge(const Slice& key, const Slice& value) override { - Merge(nullptr, key, value); + Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status Merge(const Slice& key, const Slice& value) override { + return Merge(nullptr, key, value); } - void Delete(ColumnFamilyHandle* column_family, const Slice& key) override; - void Delete(const Slice& key) override { Delete(nullptr, key); } - void Delete(ColumnFamilyHandle* column_family, - const SliceParts& key) override; - void Delete(const SliceParts& key) override { Delete(nullptr, key); } + Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override; + Status Delete(const Slice& key) override { return Delete(nullptr, key); } + Status Delete(ColumnFamilyHandle* column_family, + const 
SliceParts& key) override; + Status Delete(const SliceParts& key) override { return Delete(nullptr, key); } - void PutUntracked(ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override; - void PutUntracked(const Slice& key, const Slice& value) override { - PutUntracked(nullptr, key, value); + Status PutUntracked(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status PutUntracked(const Slice& key, const Slice& value) override { + return PutUntracked(nullptr, key, value); } - void PutUntracked(ColumnFamilyHandle* column_family, const SliceParts& key, - const SliceParts& value) override; - void PutUntracked(const SliceParts& key, const SliceParts& value) override { - PutUntracked(nullptr, key, value); + Status PutUntracked(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) override; + Status PutUntracked(const SliceParts& key, const SliceParts& value) override { + return PutUntracked(nullptr, key, value); } - void MergeUntracked(ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override; - void MergeUntracked(const Slice& key, const Slice& value) override { - MergeUntracked(nullptr, key, value); + Status MergeUntracked(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status MergeUntracked(const Slice& key, const Slice& value) override { + return MergeUntracked(nullptr, key, value); } - void DeleteUntracked(ColumnFamilyHandle* column_family, - const Slice& key) override; - void DeleteUntracked(const Slice& key) override { - DeleteUntracked(nullptr, key); + Status DeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) override; + Status DeleteUntracked(const Slice& key) override { + return DeleteUntracked(nullptr, key); } - void DeleteUntracked(ColumnFamilyHandle* column_family, - const SliceParts& key) override; - void DeleteUntracked(const SliceParts& key) override { - 
DeleteUntracked(nullptr, key); + Status DeleteUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key) override; + Status DeleteUntracked(const SliceParts& key) override { + return DeleteUntracked(nullptr, key); } void PutLogData(const Slice& blob) override; @@ -153,12 +159,24 @@ class OptimisticTransactionImpl : public OptimisticTransaction { const WriteOptions write_options_; const Snapshot* snapshot_; SequenceNumber start_sequence_number_; - WriteBatchWithIndex write_batch_; + const Comparator* cmp_; + std::unique_ptr write_batch_; private: - // Map of Column Family IDs to keys and their sequence numbers + // Map of Column Family IDs to keys and corresponding sequence numbers. + // The sequence number stored for a key will be used during commit to make + // sure this key has + // not changed since this sequence number. TransactionKeyMap tracked_keys_; + // Records the number of entries currently in the WriteBatch including calls + // to + // Put, Merge, Delete, and PutLogData() + size_t num_entries_ = 0; + + // Stack of number of entries in write_batch at each save point + std::unique_ptr> save_points_; + friend class OptimisticTransactionCallback; // Returns OK if it is safe to commit this transaction. 
Returns Status::Busy diff --git a/utilities/transactions/optimistic_transaction_test.cc b/utilities/transactions/optimistic_transaction_test.cc index b610a9ba8..09b2ee1d6 100644 --- a/utilities/transactions/optimistic_transaction_test.cc +++ b/utilities/transactions/optimistic_transaction_test.cc @@ -8,7 +8,7 @@ #include #include "rocksdb/db.h" -#include "rocksdb/utilities/optimistic_transaction.h" +#include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "util/logging.h" #include "util/testharness.h" @@ -34,7 +34,6 @@ class OptimisticTransactionTest : public testing::Test { assert(s.ok()); db = txn_db->GetBaseDB(); } - ~OptimisticTransactionTest() { delete txn_db; DestroyDB(dbname, options); @@ -50,7 +49,7 @@ TEST_F(OptimisticTransactionTest, SuccessTest) { db->Put(write_options, Slice("foo"), Slice("bar")); db->Put(write_options, Slice("foo2"), Slice("bar")); - OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); + Transaction* txn = txn_db->BeginTransaction(write_options); ASSERT_TRUE(txn); txn->GetForUpdate(read_options, "foo", &value); @@ -79,7 +78,7 @@ TEST_F(OptimisticTransactionTest, WriteConflictTest) { db->Put(write_options, "foo", "bar"); db->Put(write_options, "foo2", "bar"); - OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); + Transaction* txn = txn_db->BeginTransaction(write_options); ASSERT_TRUE(txn); txn->Put("foo", "bar2"); @@ -114,8 +113,7 @@ TEST_F(OptimisticTransactionTest, WriteConflictTest2) { db->Put(write_options, "foo2", "bar"); txn_options.set_snapshot = true; - OptimisticTransaction* txn = - txn_db->BeginTransaction(write_options, txn_options); + Transaction* txn = txn_db->BeginTransaction(write_options, txn_options); ASSERT_TRUE(txn); // This Put outside of a transaction will conflict with a later write @@ -150,8 +148,7 @@ TEST_F(OptimisticTransactionTest, ReadConflictTest) { db->Put(write_options, "foo2", "bar"); txn_options.set_snapshot = true; - 
OptimisticTransaction* txn = - txn_db->BeginTransaction(write_options, txn_options); + Transaction* txn = txn_db->BeginTransaction(write_options, txn_options); ASSERT_TRUE(txn); txn->SetSnapshot(); @@ -188,7 +185,7 @@ TEST_F(OptimisticTransactionTest, TxnOnlyTest) { string value; Status s; - OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); + Transaction* txn = txn_db->BeginTransaction(write_options); ASSERT_TRUE(txn); txn->Put("x", "y"); @@ -208,7 +205,7 @@ TEST_F(OptimisticTransactionTest, FlushTest) { db->Put(write_options, Slice("foo"), Slice("bar")); db->Put(write_options, Slice("foo2"), Slice("bar")); - OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); + Transaction* txn = txn_db->BeginTransaction(write_options); ASSERT_TRUE(txn); snapshot_read_options.snapshot = txn->GetSnapshot(); @@ -248,7 +245,7 @@ TEST_F(OptimisticTransactionTest, FlushTest2) { db->Put(write_options, Slice("foo"), Slice("bar")); db->Put(write_options, Slice("foo2"), Slice("bar")); - OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); + Transaction* txn = txn_db->BeginTransaction(write_options); ASSERT_TRUE(txn); snapshot_read_options.snapshot = txn->GetSnapshot(); @@ -302,7 +299,7 @@ TEST_F(OptimisticTransactionTest, NoSnapshotTest) { db->Put(write_options, "AAA", "bar"); - OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); + Transaction* txn = txn_db->BeginTransaction(write_options); ASSERT_TRUE(txn); // Modify key after transaction start @@ -333,7 +330,7 @@ TEST_F(OptimisticTransactionTest, MultipleSnapshotTest) { db->Put(write_options, "BBB", "bar"); db->Put(write_options, "CCC", "bar"); - OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); + Transaction* txn = txn_db->BeginTransaction(write_options); ASSERT_TRUE(txn); db->Put(write_options, "AAA", "bar1"); @@ -410,8 +407,7 @@ TEST_F(OptimisticTransactionTest, MultipleSnapshotTest) { OptimisticTransactionOptions txn_options; 
txn_options.set_snapshot = true; - OptimisticTransaction* txn2 = - txn_db->BeginTransaction(write_options, txn_options); + Transaction* txn2 = txn_db->BeginTransaction(write_options, txn_options); txn2->SetSnapshot(); // This should not conflict in txn since the snapshot is later than the @@ -467,15 +463,14 @@ TEST_F(OptimisticTransactionTest, ColumnFamiliesTest) { ASSERT_OK(s); db = txn_db->GetBaseDB(); - OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); + Transaction* txn = txn_db->BeginTransaction(write_options); ASSERT_TRUE(txn); txn->SetSnapshot(); snapshot_read_options.snapshot = txn->GetSnapshot(); txn_options.set_snapshot = true; - OptimisticTransaction* txn2 = - txn_db->BeginTransaction(write_options, txn_options); + Transaction* txn2 = txn_db->BeginTransaction(write_options, txn_options); ASSERT_TRUE(txn2); // Write some data to the db @@ -594,7 +589,7 @@ TEST_F(OptimisticTransactionTest, EmptyTest) { s = db->Put(write_options, "aaa", "aaa"); ASSERT_OK(s); - OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); + Transaction* txn = txn_db->BeginTransaction(write_options); s = txn->Commit(); ASSERT_OK(s); delete txn; @@ -630,11 +625,10 @@ TEST_F(OptimisticTransactionTest, PredicateManyPreceders) { Status s; txn_options.set_snapshot = true; - OptimisticTransaction* txn1 = - txn_db->BeginTransaction(write_options, txn_options); + Transaction* txn1 = txn_db->BeginTransaction(write_options, txn_options); read_options1.snapshot = txn1->GetSnapshot(); - OptimisticTransaction* txn2 = txn_db->BeginTransaction(write_options); + Transaction* txn2 = txn_db->BeginTransaction(write_options); txn2->SetSnapshot(); read_options2.snapshot = txn2->GetSnapshot(); @@ -697,8 +691,8 @@ TEST_F(OptimisticTransactionTest, LostUpdate) { // Test 2 transactions writing to the same key in multiple orders and // with/without snapshots - OptimisticTransaction* txn1 = txn_db->BeginTransaction(write_options); - OptimisticTransaction* txn2 = 
txn_db->BeginTransaction(write_options); + Transaction* txn1 = txn_db->BeginTransaction(write_options); + Transaction* txn2 = txn_db->BeginTransaction(write_options); txn1->Put("1", "1"); txn2->Put("1", "2"); @@ -792,7 +786,7 @@ TEST_F(OptimisticTransactionTest, UntrackedWrites) { Status s; // Verify transaction rollback works for untracked keys. - OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); + Transaction* txn = txn_db->BeginTransaction(write_options); txn->PutUntracked("untracked", "0"); txn->Rollback(); s = db->Get(read_options, "untracked", &value); @@ -836,6 +830,280 @@ TEST_F(OptimisticTransactionTest, UntrackedWrites) { delete txn; } +TEST_F(OptimisticTransactionTest, IteratorTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + OptimisticTransactionOptions txn_options; + string value; + Status s; + + // Write some keys to the db + s = db->Put(write_options, "A", "a"); + ASSERT_OK(s); + + s = db->Put(write_options, "G", "g"); + ASSERT_OK(s); + + s = db->Put(write_options, "F", "f"); + ASSERT_OK(s); + + s = db->Put(write_options, "C", "c"); + ASSERT_OK(s); + + s = db->Put(write_options, "D", "d"); + ASSERT_OK(s); + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + // Write some keys in a txn + s = txn->Put("B", "b"); + ASSERT_OK(s); + + s = txn->Put("H", "h"); + ASSERT_OK(s); + + s = txn->Delete("D"); + ASSERT_OK(s); + + s = txn->Put("E", "e"); + ASSERT_OK(s); + + txn->SetSnapshot(); + const Snapshot* snapshot = txn->GetSnapshot(); + + // Write some keys to the db after the snapshot + s = db->Put(write_options, "BB", "xx"); + ASSERT_OK(s); + + s = db->Put(write_options, "C", "xx"); + ASSERT_OK(s); + + read_options.snapshot = snapshot; + Iterator* iter = txn->GetIterator(read_options); + ASSERT_OK(iter->status()); + iter->SeekToFirst(); + + // Read all keys via iter and lock them all + std::string results[] = {"a", "b", "c", "e", "f", "g", "h"}; + for (int i = 0; 
i < 7; i++) { + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(results[i], iter->value().ToString()); + + s = txn->GetForUpdate(read_options, iter->key(), nullptr); + ASSERT_OK(s); + + iter->Next(); + } + ASSERT_FALSE(iter->Valid()); + + iter->Seek("G"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("g", iter->value().ToString()); + + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("f", iter->value().ToString()); + + iter->Seek("D"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("e", iter->value().ToString()); + + iter->Seek("C"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("c", iter->value().ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("e", iter->value().ToString()); + + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a", iter->value().ToString()); + + iter->Seek("X"); + ASSERT_OK(iter->status()); + ASSERT_FALSE(iter->Valid()); + + iter->SeekToLast(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("h", iter->value().ToString()); + + // key "C" was modified in the db after txn's snapshot. txn will not commit. 
+ s = txn->Commit(); + ASSERT_TRUE(s.IsBusy()); + + delete iter; + delete txn; +} + +TEST_F(OptimisticTransactionTest, SavepointTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + OptimisticTransactionOptions txn_options; + string value; + Status s; + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + txn->RollbackToSavePoint(); + + txn->SetSavePoint(); // 1 + + txn->RollbackToSavePoint(); // Rollback to beginning of txn + txn->RollbackToSavePoint(); + + s = txn->Put("B", "b"); + ASSERT_OK(s); + + s = txn->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("b", value); + + delete txn; + txn = txn_db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + s = txn->Put("A", "a"); + ASSERT_OK(s); + + s = txn->Put("B", "bb"); + ASSERT_OK(s); + + s = txn->Put("C", "c"); + ASSERT_OK(s); + + txn->SetSavePoint(); // 2 + + s = txn->Delete("B"); + ASSERT_OK(s); + + s = txn->Put("C", "cc"); + ASSERT_OK(s); + + s = txn->Put("D", "d"); + ASSERT_OK(s); + + txn->RollbackToSavePoint(); // Rollback to 2 + + s = txn->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("a", value); + + s = txn->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("bb", value); + + s = txn->Get(read_options, "C", &value); + ASSERT_OK(s); + ASSERT_EQ("c", value); + + s = txn->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Put("A", "a"); + ASSERT_OK(s); + + s = txn->Put("E", "e"); + ASSERT_OK(s); + + txn->RollbackToSavePoint(); // Rollback to beginning of txn + + s = txn->Get(read_options, "A", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("b", value); + + s = txn->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Get(read_options, "E", &value); + 
ASSERT_TRUE(s.IsNotFound()); + + s = txn->Put("A", "aa"); + ASSERT_OK(s); + + s = txn->Put("F", "f"); + ASSERT_OK(s); + + txn->SetSavePoint(); // 3 + txn->SetSavePoint(); // 4 + + s = txn->Put("G", "g"); + ASSERT_OK(s); + + s = txn->Delete("F"); + ASSERT_OK(s); + + s = txn->Delete("B"); + ASSERT_OK(s); + + s = txn->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("aa", value); + + s = txn->Get(read_options, "F", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Get(read_options, "B", &value); + ASSERT_TRUE(s.IsNotFound()); + + txn->RollbackToSavePoint(); // Rollback to 3 + + s = txn->Get(read_options, "F", &value); + ASSERT_OK(s); + ASSERT_EQ("f", value); + + s = txn->Get(read_options, "G", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "F", &value); + ASSERT_OK(s); + ASSERT_EQ("f", value); + + s = db->Get(read_options, "G", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = db->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("aa", value); + + s = db->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("b", value); + + s = db->Get(read_options, "C", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = db->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = db->Get(read_options, "E", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/utilities/transactions/transaction_db_impl.cc b/utilities/transactions/transaction_db_impl.cc new file mode 100644 index 000000000..84baf4b40 --- /dev/null +++ b/utilities/transactions/transaction_db_impl.cc @@ -0,0 +1,254 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#ifndef ROCKSDB_LITE + +#include +#include + +#include "utilities/transactions/transaction_db_impl.h" + +#include "db/db_impl.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/transaction_db.h" +#include "utilities/transactions/transaction_impl.h" + +namespace rocksdb { + +TransactionDBImpl::TransactionDBImpl(DB* db, + const TransactionDBOptions& txn_db_options) + : TransactionDB(db), + txn_db_options_(txn_db_options), + lock_mgr_(txn_db_options.num_stripes, txn_db_options.max_num_locks) {} + +Transaction* TransactionDBImpl::BeginTransaction( + const WriteOptions& write_options, const TransactionOptions& txn_options) { + Transaction* txn = new TransactionImpl(this, write_options, txn_options); + + return txn; +} + +TransactionDBOptions TransactionDBImpl::ValidateTxnDBOptions( + const TransactionDBOptions& txn_db_options) { + TransactionDBOptions validated = txn_db_options; + + if (txn_db_options.num_stripes == 0) { + validated.num_stripes = 1; + } + + return validated; +} + +Status TransactionDB::Open(const Options& options, + const TransactionDBOptions& txn_db_options, + const std::string& dbname, TransactionDB** dbptr) { + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + std::vector handles; + Status s = TransactionDB::Open(db_options, txn_db_options, dbname, + column_families, &handles, dbptr); + if (s.ok()) { + assert(handles.size() == 1); + // Safe to delete the handle: DBImpl always holds its own reference to the + // default column family. + delete handles[0]; + } + + return s; +} + +Status TransactionDB::Open( + const DBOptions& db_options, const TransactionDBOptions& txn_db_options, + const std::string& dbname, + const std::vector& column_families, + std::vector* handles, TransactionDB** dbptr) { + Status s; + DB* db; + + std::vector column_families_copy =
column_families; + + // Enable MemTable History if not already enabled + for (auto& column_family : column_families_copy) { + ColumnFamilyOptions* options = &column_family.options; + + if (options->max_write_buffer_number_to_maintain == 0) { + // Setting to -1 will set the History size to max_write_buffer_number. + options->max_write_buffer_number_to_maintain = -1; + } + } + // Open with the adjusted copy so the MemTable history setting applies. + s = DB::Open(db_options, dbname, column_families_copy, handles, &db); + + if (s.ok()) { + TransactionDBImpl* txn_db = new TransactionDBImpl( + db, TransactionDBImpl::ValidateTxnDBOptions(txn_db_options)); + + for (auto cf_ptr : *handles) { + txn_db->AddColumnFamily(cf_ptr); + } + + *dbptr = txn_db; + } + + return s; +} + +// Let TransactionLockMgr know that this column family exists so it can +// allocate a LockMap for it. +void TransactionDBImpl::AddColumnFamily(const ColumnFamilyHandle* handle) { + lock_mgr_.AddColumnFamily(handle->GetID()); +} + +Status TransactionDBImpl::CreateColumnFamily( + const ColumnFamilyOptions& options, const std::string& column_family_name, + ColumnFamilyHandle** handle) { + InstrumentedMutexLock l(&column_family_mutex_); + + Status s = db_->CreateColumnFamily(options, column_family_name, handle); + if (s.ok()) { + lock_mgr_.AddColumnFamily((*handle)->GetID()); + } + + return s; +} + +// Let TransactionLockMgr know that it can deallocate the LockMap for this +// column family.
+Status TransactionDBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { + InstrumentedMutexLock l(&column_family_mutex_); + + Status s = db_->DropColumnFamily(column_family); + if (s.ok()) { + lock_mgr_.RemoveColumnFamily(column_family->GetID()); + } + + return s; +} + +Status TransactionDBImpl::TryLock(TransactionImpl* txn, uint32_t cfh_id, + const std::string& key) { + return lock_mgr_.TryLock(txn, cfh_id, key, GetEnv()); +} + +void TransactionDBImpl::UnLock(TransactionImpl* txn, TransactionKeyMap* keys) { + lock_mgr_.UnLock(txn, keys, GetEnv()); +} + +void TransactionDBImpl::UnLock(TransactionImpl* txn, uint32_t cfh_id, + const std::string& key) { + lock_mgr_.UnLock(txn, cfh_id, key, GetEnv()); +} + +// Used when wrapping DB write operations in a transaction +Transaction* TransactionDBImpl::BeginInternalTransaction( + const WriteOptions& options) { + TransactionOptions txn_options; + Transaction* txn = BeginTransaction(options, txn_options); + + assert(dynamic_cast(txn) != nullptr); + auto txn_impl = reinterpret_cast(txn); + + // Use default timeout for non-transactional writes + txn_impl->SetLockTimeout(txn_db_options_.default_lock_timeout); + + return txn; +} + +// All user Put, Merge, Delete, and Write requests must be intercepted to make +// sure that they lock all keys that they are writing to avoid causing conflicts +// with any concurent transactions. The easiest way to do this is to wrap all +// write operations in a transaction. +// +// Put(), Merge(), and Delete() only lock a single key per call. Write() will +// sort its keys before locking them. This guarantees that TransactionDB write +// methods cannot deadlock with eachother (but still could deadlock with a +// Transaction). 
+Status TransactionDBImpl::Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) { + Status s; + + Transaction* txn = BeginInternalTransaction(options); + + // Since the client didn't create a transaction, they don't care about + // conflict checking for this write. So we just need to do PutUntracked(). + s = txn->PutUntracked(column_family, key, val); + + if (s.ok()) { + s = txn->Commit(); + } + + delete txn; + + return s; +} + +Status TransactionDBImpl::Delete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) { + Status s; + + Transaction* txn = BeginInternalTransaction(wopts); + + // Since the client didn't create a transaction, they don't care about + // conflict checking for this write. So we just need to do + // DeleteUntracked(). + s = txn->DeleteUntracked(column_family, key); + + if (s.ok()) { + s = txn->Commit(); + } + + delete txn; + + return s; +} + +Status TransactionDBImpl::Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + Status s; + + Transaction* txn = BeginInternalTransaction(options); + + // Since the client didn't create a transaction, they don't care about + // conflict checking for this write. So we just need to do + // MergeUntracked(). + s = txn->MergeUntracked(column_family, key, value); + + if (s.ok()) { + s = txn->Commit(); + } + + delete txn; + + return s; +} + +Status TransactionDBImpl::Write(const WriteOptions& opts, WriteBatch* updates) { + // Need to lock all keys in this batch to prevent write conflicts with + // concurrent transactions. + Transaction* txn = BeginInternalTransaction(opts); + + assert(dynamic_cast(txn) != nullptr); + auto txn_impl = reinterpret_cast(txn); + + // Since commitBatch sorts the keys before locking, concurrent Write() + // operations will not cause a deadlock. 
+ // In order to avoid a deadlock with a concurrent Transaction, Transactions + // should use a lock timeout. + Status s = txn_impl->CommitBatch(updates); + + delete txn; + + return s; +} + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_db_impl.h b/utilities/transactions/transaction_db_impl.h new file mode 100644 index 000000000..c4b69d29e --- /dev/null +++ b/utilities/transactions/transaction_db_impl.h @@ -0,0 +1,80 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once +#ifndef ROCKSDB_LITE + +#include + +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/transaction_db.h" +#include "utilities/transactions/transaction_impl.h" +#include "utilities/transactions/transaction_lock_mgr.h" + +namespace rocksdb { + +class TransactionDBImpl : public TransactionDB { + public: + explicit TransactionDBImpl(DB* db, + const TransactionDBOptions& txn_db_options); + + ~TransactionDBImpl() {} + + Transaction* BeginTransaction(const WriteOptions& write_options, + const TransactionOptions& txn_options) override; + + using StackableDB::Put; + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& val) override; + + using StackableDB::Delete; + virtual Status Delete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) override; + + using StackableDB::Merge; + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + + using StackableDB::Write; + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; + + using StackableDB::CreateColumnFamily; + virtual Status 
CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle) override; + + using StackableDB::DropColumnFamily; + virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override; + + Status TryLock(TransactionImpl* txn, uint32_t cfh_id, const std::string& key); + + void UnLock(TransactionImpl* txn, TransactionKeyMap* keys); + void UnLock(TransactionImpl* txn, uint32_t cfh_id, const std::string& key); + + void AddColumnFamily(const ColumnFamilyHandle* handle); + + static TransactionDBOptions ValidateTxnDBOptions( + const TransactionDBOptions& txn_db_options); + + const TransactionDBOptions& GetTxnDBOptions() const { + return txn_db_options_; + } + + private: + const TransactionDBOptions txn_db_options_; + TransactionLockMgr lock_mgr_; + + // Must be held when adding/dropping column families. + InstrumentedMutex column_family_mutex_; + Transaction* BeginInternalTransaction(const WriteOptions& options); + Status WriteHelper(WriteBatch* updates, TransactionImpl* txn_impl); +}; + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_impl.cc b/utilities/transactions/transaction_impl.cc new file mode 100644 index 000000000..1bbdfcac2 --- /dev/null +++ b/utilities/transactions/transaction_impl.cc @@ -0,0 +1,598 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/transaction_impl.h" + +#include +#include +#include +#include + +#include "db/column_family.h" +#include "db/db_impl.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/transaction_db.h" +#include "util/string_util.h" +#include "utilities/transactions/transaction_db_impl.h" +#include "utilities/transactions/transaction_util.h" + +namespace rocksdb { + +struct WriteOptions; + +std::atomic TransactionImpl::txn_id_counter_(1); + +TransactionID TransactionImpl::GenTxnID() { + return txn_id_counter_.fetch_add(1); +} + +TransactionImpl::TransactionImpl(TransactionDB* txn_db, + const WriteOptions& write_options, + const TransactionOptions& txn_options) + : db_(txn_db), + txn_db_impl_(nullptr), + txn_id_(GenTxnID()), + write_options_(write_options), + snapshot_(nullptr), + cmp_(GetColumnFamilyUserComparator(txn_db->DefaultColumnFamily())), + write_batch_(new WriteBatchWithIndex(cmp_, 0, true)), + start_time_( + txn_options.expiration >= 0 ? db_->GetEnv()->NowMicros() / 1000 : 0), + expiration_time_(txn_options.expiration >= 0 + ? 
start_time_ + txn_options.expiration + : 0), + lock_timeout_(txn_options.lock_timeout) { + txn_db_impl_ = dynamic_cast(txn_db); + assert(txn_db_impl_); + + if (lock_timeout_ < 0) { + // Lock timeout not set, use default + lock_timeout_ = txn_db_impl_->GetTxnDBOptions().transaction_lock_timeout; + } + + if (txn_options.set_snapshot) { + SetSnapshot(); + } +} + +TransactionImpl::~TransactionImpl() { + Cleanup(); + + if (snapshot_ != nullptr) { + db_->ReleaseSnapshot(snapshot_); + } +} + +void TransactionImpl::SetSnapshot() { + if (snapshot_ != nullptr) { + db_->ReleaseSnapshot(snapshot_); + } + + snapshot_ = db_->GetSnapshot(); +} + +void TransactionImpl::Cleanup() { + write_batch_->Clear(); + num_entries_ = 0; + txn_db_impl_->UnLock(this, &tracked_keys_); + tracked_keys_.clear(); + save_points_.reset(nullptr); +} + +bool TransactionImpl::IsExpired() const { + if (expiration_time_ > 0) { + if (db_->GetEnv()->NowMicros() >= expiration_time_ * 1000) { + // Transaction is expired. + return true; + } + } + + return false; +} + +Status TransactionImpl::CommitBatch(WriteBatch* batch) { + TransactionKeyMap keys_to_unlock; + + Status s = LockBatch(batch, &keys_to_unlock); + + if (s.ok()) { + s = DoCommit(batch); + + txn_db_impl_->UnLock(this, &keys_to_unlock); + } + + return s; +} + +Status TransactionImpl::Commit() { + Status s = DoCommit(write_batch_->GetWriteBatch()); + + Cleanup(); + + return s; +} + +Status TransactionImpl::DoCommit(WriteBatch* batch) { + Status s; + + // Do write directly on base db as TransctionDB::Write() would attempt to + // do conflict checking that we've already done. + DB* db = db_->GetBaseDB(); + + if (expiration_time_ > 0) { + // We cannot commit a transaction that is expired as its locks might have + // been released. + // To avoid race conditions, we need to use a WriteCallback to check the + // expiration time once we're on the writer thread. 
+ TransactionCallback callback(this); + + assert(dynamic_cast(db) != nullptr); + auto db_impl = reinterpret_cast(db); + s = db_impl->WriteWithCallback(write_options_, batch, &callback); + } else { + s = db->Write(write_options_, batch); + } + + return s; +} + +void TransactionImpl::Rollback() { Cleanup(); } + +void TransactionImpl::SetSavePoint() { + if (num_entries_ > 0) { + // If transaction is empty, no need to record anything. + + if (save_points_ == nullptr) { + save_points_.reset(new std::stack()); + } + save_points_->push(num_entries_); + } +} + +void TransactionImpl::RollbackToSavePoint() { + size_t savepoint_entries = 0; + + if (save_points_ != nullptr && save_points_->size() > 0) { + savepoint_entries = save_points_->top(); + save_points_->pop(); + } + + assert(savepoint_entries <= num_entries_); + + if (savepoint_entries == num_entries_) { + // No changes to rollback + } else if (savepoint_entries == 0) { + // Rollback everything + Rollback(); + } else { + assert(dynamic_cast(db_->GetBaseDB()) != nullptr); + auto db_impl = reinterpret_cast(db_->GetBaseDB()); + + WriteBatchWithIndex* new_batch = new WriteBatchWithIndex(cmp_, 0, true); + Status s = TransactionUtil::CopyFirstN( + savepoint_entries, write_batch_.get(), new_batch, db_impl); + if (!s.ok()) { + // TODO: Should we change this function to return a Status or should we + // somehow make it so RollbackToSavePoint() can never fail?? Not easy to + // handle the case where a client accesses a column family that's been + // dropped. + // After chatting with Siying, I'm going to send a diff that adds + // savepoint support in WriteBatchWithIndex and let reviewers decide which + // approach is cleaner. + fprintf(stderr, "STATUS: %s \n", s.ToString().c_str()); + delete new_batch; + } else { + write_batch_.reset(new_batch); + } + + num_entries_ = savepoint_entries; + } +} + +// Lock all keys in this batch. 
+// On success, caller should unlock keys_to_unlock +Status TransactionImpl::LockBatch(WriteBatch* batch, + TransactionKeyMap* keys_to_unlock) { + class Handler : public WriteBatch::Handler { + public: + // Sorted map of column_family_id to sorted set of keys. + // Since LockBatch() always locks keys in sorted order, it cannot deadlock + // with itself. We're not using a comparator here since it doesn't matter + // what the sorting is as long as it's consistent. + std::map> keys_; + + Handler() {} + + void RecordKey(uint32_t column_family_id, const Slice& key) { + std::string key_str = key.ToString(); + + auto iter = (keys_)[column_family_id].find(key_str); + if (iter == (keys_)[column_family_id].end()) { + // key not yet seen, store it. + (keys_)[column_family_id].insert({std::move(key_str)}); + } + } + + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + RecordKey(column_family_id, key); + return Status::OK(); + } + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + RecordKey(column_family_id, key); + return Status::OK(); + } + virtual Status DeleteCF(uint32_t column_family_id, + const Slice& key) override { + RecordKey(column_family_id, key); + return Status::OK(); + } + }; + + // Iterating on this handler will add all keys in this batch into keys + Handler handler; + batch->Iterate(&handler); + + Status s; + + // Attempt to lock all keys + for (const auto& cf_iter : handler.keys_) { + uint32_t cfh_id = cf_iter.first; + auto& cfh_keys = cf_iter.second; + + for (const auto& key_iter : cfh_keys) { + const std::string& key = key_iter; + + s = txn_db_impl_->TryLock(this, cfh_id, key); + if (!s.ok()) { + break; + } + (*keys_to_unlock)[cfh_id].insert({std::move(key), kMaxSequenceNumber}); + } + + if (!s.ok()) { + break; + } + } + + if (!s.ok()) { + txn_db_impl_->UnLock(this, keys_to_unlock); + } + + return s; +} + +Status TransactionImpl::TryLock(ColumnFamilyHandle* 
column_family, + const SliceParts& key, bool check_snapshot) { + size_t key_size = 0; + for (int i = 0; i < key.num_parts; ++i) { + key_size += key.parts[i].size(); + } + + std::string str; + str.reserve(key_size); + + for (int i = 0; i < key.num_parts; ++i) { + str.append(key.parts[i].data(), key.parts[i].size()); + } + + return TryLock(column_family, str, check_snapshot); +} + +// Attempt to lock this key. +// Returns OK if the key has been successfully locked. Non-ok, otherwise. +// If check_shapshot is true and this transaction has a snapshot set, +// this key will only be locked if there have been no writes to this key since +// the snapshot time. +Status TransactionImpl::TryLock(ColumnFamilyHandle* column_family, + const Slice& key, bool check_snapshot) { + uint32_t cfh_id = GetColumnFamilyID(column_family); + std::string key_str = key.ToString(); + bool previously_locked; + Status s; + + // lock this key if this transactions hasn't already locked it + auto iter = tracked_keys_[cfh_id].find(key_str); + if (iter == tracked_keys_[cfh_id].end()) { + previously_locked = false; + + s = txn_db_impl_->TryLock(this, cfh_id, key_str); + + if (s.ok()) { + // Record that we've locked this key + auto result = tracked_keys_[cfh_id].insert({key_str, kMaxSequenceNumber}); + iter = result.first; + } + } else { + previously_locked = true; + } + + if (s.ok()) { + // If a snapshot is set, we need to make sure the key hasn't been modified + // since the snapshot. This must be done after we locked the key. + if (!check_snapshot || snapshot_ == nullptr) { + // Need to remember the earliest sequence number that we know that this + // key has not been modified after. This is useful if this same + // transaction + // later tries to lock this key again. + if (iter->second == kMaxSequenceNumber) { + // Since we haven't checked a snapshot, we only know this key has not + // been modified since after we locked it. 
+ iter->second = db_->GetLatestSequenceNumber(); + } + } else { + // If the key has been previous validated at a sequence number earlier + // than the curent snapshot's sequence number, we already know it has not + // been modified. + bool already_validated = iter->second <= snapshot_->GetSequenceNumber(); + + if (!already_validated) { + s = CheckKeySequence(column_family, key); + + if (s.ok()) { + // Record that there have been no writes to this key after this + // sequence. + iter->second = snapshot_->GetSequenceNumber(); + } else { + // Failed to validate key + if (!previously_locked) { + // Unlock key we just locked + txn_db_impl_->UnLock(this, cfh_id, key.ToString()); + tracked_keys_[cfh_id].erase(iter); + } + } + } + } + } + + return s; +} + +// Return OK() if this key has not been modified more recently than the +// transaction snapshot_. +Status TransactionImpl::CheckKeySequence(ColumnFamilyHandle* column_family, + const Slice& key) { + Status result; + if (snapshot_ != nullptr) { + assert(dynamic_cast(db_->GetBaseDB()) != nullptr); + auto db_impl = reinterpret_cast(db_->GetBaseDB()); + + ColumnFamilyHandle* cfh = column_family ? 
column_family : + db_impl->DefaultColumnFamily(); + + result = TransactionUtil::CheckKeyForConflicts( + db_impl, cfh, key.ToString(), + snapshot_->GetSequenceNumber()); + } + + return result; +} + +Status TransactionImpl::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) { + return write_batch_->GetFromBatchAndDB(db_, read_options, column_family, key, + value); +} + +Status TransactionImpl::GetForUpdate(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, std::string* value) { + Status s = TryLock(column_family, key); + + if (s.ok() && value != nullptr) { + s = Get(read_options, column_family, key, value); + } + return s; +} + +std::vector TransactionImpl::MultiGet( + const ReadOptions& read_options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) { + size_t num_keys = keys.size(); + values->resize(num_keys); + + std::vector stat_list(num_keys); + for (size_t i = 0; i < num_keys; ++i) { + std::string* value = values ? &(*values)[i] : nullptr; + stat_list[i] = Get(read_options, column_family[i], keys[i], value); + } + + return stat_list; +} + +std::vector TransactionImpl::MultiGetForUpdate( + const ReadOptions& read_options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) { + // Regardless of whether the MultiGet succeeded, track these keys. + size_t num_keys = keys.size(); + values->resize(num_keys); + + // Lock all keys + for (size_t i = 0; i < num_keys; ++i) { + Status s = TryLock(column_family[i], keys[i]); + if (!s.ok()) { + // Fail entire multiget if we cannot lock all keys + return std::vector(num_keys, s); + } + } + + // TODO(agiardullo): optimize multiget? + std::vector stat_list(num_keys); + for (size_t i = 0; i < num_keys; ++i) { + std::string* value = values ? 
&(*values)[i] : nullptr; + stat_list[i] = Get(read_options, column_family[i], keys[i], value); + } + + return stat_list; +} + +Iterator* TransactionImpl::GetIterator(const ReadOptions& read_options) { + Iterator* db_iter = db_->NewIterator(read_options); + assert(db_iter); + + return write_batch_->NewIteratorWithBase(db_iter); +} + +Iterator* TransactionImpl::GetIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) { + Iterator* db_iter = db_->NewIterator(read_options, column_family); + assert(db_iter); + + return write_batch_->NewIteratorWithBase(column_family, db_iter); +} + +Status TransactionImpl::Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) { + Status s = TryLock(column_family, key); + + if (s.ok()) { + write_batch_->Put(column_family, key, value); + num_entries_++; + } + + return s; +} + +Status TransactionImpl::Put(ColumnFamilyHandle* column_family, + const SliceParts& key, const SliceParts& value) { + Status s = TryLock(column_family, key); + + if (s.ok()) { + write_batch_->Put(column_family, key, value); + num_entries_++; + } + + return s; +} + +Status TransactionImpl::Merge(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + Status s = TryLock(column_family, key); + + if (s.ok()) { + write_batch_->Merge(column_family, key, value); + num_entries_++; + } + + return s; +} + +Status TransactionImpl::Delete(ColumnFamilyHandle* column_family, + const Slice& key) { + Status s = TryLock(column_family, key); + + if (s.ok()) { + write_batch_->Delete(column_family, key); + num_entries_++; + } + + return s; +} + +Status TransactionImpl::Delete(ColumnFamilyHandle* column_family, + const SliceParts& key) { + Status s = TryLock(column_family, key); + + if (s.ok()) { + write_batch_->Delete(column_family, key); + num_entries_++; + } + + return s; +} + +Status TransactionImpl::PutUntracked(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + // Even though we do 
not care about doing conflict checking for this write, + // we still need to take a lock to make sure we do not cause a conflict with + // some other write. However, we do not need to check if there have been + // any writes since this transaction's snapshot. + bool check_snapshot = false; + + // TODO(agiardullo): could optimize by supporting shared txn locks in the + // future + Status s = TryLock(column_family, key, check_snapshot); + + if (s.ok()) { + write_batch_->Put(column_family, key, value); + num_entries_++; + } + + return s; +} + +Status TransactionImpl::PutUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key, + const SliceParts& value) { + bool check_snapshot = false; + Status s = TryLock(column_family, key, check_snapshot); + + if (s.ok()) { + write_batch_->Put(column_family, key, value); + num_entries_++; + } + + return s; +} + +Status TransactionImpl::MergeUntracked(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + bool check_snapshot = false; + Status s = TryLock(column_family, key, check_snapshot); + + if (s.ok()) { + write_batch_->Merge(column_family, key, value); + num_entries_++; + } + + return s; +} + +Status TransactionImpl::DeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) { + bool check_snapshot = false; + Status s = TryLock(column_family, key, check_snapshot); + + if (s.ok()) { + write_batch_->Delete(column_family, key); + num_entries_++; + } + + return s; +} + +Status TransactionImpl::DeleteUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key) { + bool check_snapshot = false; + Status s = TryLock(column_family, key, check_snapshot); + + if (s.ok()) { + write_batch_->Delete(column_family, key); + num_entries_++; + } + + return s; +} + +void TransactionImpl::PutLogData(const Slice& blob) { + write_batch_->PutLogData(blob); + num_entries_++; +} + +WriteBatchWithIndex* TransactionImpl::GetWriteBatch() { + return write_batch_.get(); +} + +} // namespace rocksdb + 
+#endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_impl.h b/utilities/transactions/transaction_impl.h new file mode 100644 index 000000000..c30c9f1b7 --- /dev/null +++ b/utilities/transactions/transaction_impl.h @@ -0,0 +1,263 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include + +#include "db/write_callback.h" +#include "rocksdb/db.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "utilities/transactions/transaction_util.h" + +namespace rocksdb { + +using TransactionID = uint64_t; + +class TransactionDBImpl; + +class TransactionImpl : public Transaction { + public: + TransactionImpl(TransactionDB* db, const WriteOptions& write_options, + const TransactionOptions& txn_options); + + virtual ~TransactionImpl(); + + Status Commit() override; + + Status CommitBatch(WriteBatch* batch); + + void Rollback() override; + + void SetSavePoint() override; + + void RollbackToSavePoint() override; + + Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, std::string* value) override; + + Status Get(const ReadOptions& options, const Slice& key, + std::string* value) override { + return Get(options, db_->DefaultColumnFamily(), key, value); + } + + Status GetForUpdate(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) override; + + Status GetForUpdate(const ReadOptions& options, const Slice& key, + std::string* value) override { + return 
GetForUpdate(options, db_->DefaultColumnFamily(), key, value); + } + + std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, + std::vector* values) override; + + std::vector MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values) override { + return MultiGet(options, std::vector( + keys.size(), db_->DefaultColumnFamily()), + keys, values); + } + + std::vector MultiGetForUpdate( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, + std::vector* values) override; + + std::vector MultiGetForUpdate( + const ReadOptions& options, const std::vector& keys, + std::vector* values) override { + return MultiGetForUpdate(options, + std::vector( + keys.size(), db_->DefaultColumnFamily()), + keys, values); + } + + Iterator* GetIterator(const ReadOptions& read_options) override; + Iterator* GetIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) override; + + Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status Put(const Slice& key, const Slice& value) override { + return Put(nullptr, key, value); + } + + Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) override; + Status Put(const SliceParts& key, const SliceParts& value) override { + return Put(nullptr, key, value); + } + + Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status Merge(const Slice& key, const Slice& value) override { + return Merge(nullptr, key, value); + } + + Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override; + Status Delete(const Slice& key) override { return Delete(nullptr, key); } + Status Delete(ColumnFamilyHandle* column_family, + const SliceParts& key) override; + Status Delete(const SliceParts& key) override { return Delete(nullptr, key); } + + Status 
PutUntracked(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status PutUntracked(const Slice& key, const Slice& value) override { + return PutUntracked(nullptr, key, value); + } + + Status PutUntracked(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) override; + Status PutUntracked(const SliceParts& key, const SliceParts& value) override { + return PutUntracked(nullptr, key, value); + } + + Status MergeUntracked(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status MergeUntracked(const Slice& key, const Slice& value) override { + return MergeUntracked(nullptr, key, value); + } + + Status DeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) override; + Status DeleteUntracked(const Slice& key) override { + return DeleteUntracked(nullptr, key); + } + Status DeleteUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key) override; + Status DeleteUntracked(const SliceParts& key) override { + return DeleteUntracked(nullptr, key); + } + + void PutLogData(const Slice& blob) override; + + const Snapshot* GetSnapshot() const override { return snapshot_; } + + void SetSnapshot() override; + + WriteBatchWithIndex* GetWriteBatch() override; + + // Generate a new unique transaction identifier + static TransactionID GenTxnID(); + + TransactionID GetTxnID() const { return txn_id_; } + + // Returns the time (in milliseconds according to Env->GetMicros()*1000) + // that this transaction will be expired. Returns 0 if this transaction does + // not expire. + uint64_t GetExpirationTime() const { return expiration_time_; } + + // returns true if this transaction has an expiration_time and has expired. + bool IsExpired() const; + + // Returns the number of milliseconds a transaction can wait on acquiring a + // lock or -1 if there is no timeout. 
+ int64_t GetLockTimeout() const { return lock_timeout_; } + void SetLockTimeout(int64_t timeout) { lock_timeout_ = timeout; } + + private: + TransactionDB* const db_; + + TransactionDBImpl* txn_db_impl_; + + // Used to create unique ids for transactions. + static std::atomic txn_id_counter_; + + // Unique ID for this transaction + const TransactionID txn_id_; + + const WriteOptions write_options_; + + // If snapshot_ is set, all keys that locked must also have not been written + // since this snapshot + const Snapshot* snapshot_; + + const Comparator* cmp_; + + std::unique_ptr write_batch_; + + // If expiration_ is non-zero, start_time_ stores that time the txn was + // constructed, + // in milliseconds. + const uint64_t start_time_; + + // If non-zero, this transaction should not be committed after this time (in + // milliseconds) + const uint64_t expiration_time_; + + // Timeout in microseconds when locking a key or -1 if there is no timeout. + int64_t lock_timeout_; + + // Map from column_family_id to map of keys to Sequence Numbers. Stores keys + // that have been locked. + // The key is known to not have been modified after the Sequence Number + // stored. 
+ TransactionKeyMap tracked_keys_; + + // Records the number of entries currently in the WriteBatch include calls to + // PutLogData() + size_t num_entries_ = 0; + + // Stack of number of entries in write_batch at each save point + std::unique_ptr> save_points_; + + Status TryLock(ColumnFamilyHandle* column_family, const Slice& key, + bool check_snapshot = true); + Status TryLock(ColumnFamilyHandle* column_family, const SliceParts& key, + bool check_snapshot = true); + void Cleanup(); + + Status CheckKeySequence(ColumnFamilyHandle* column_family, const Slice& key); + + Status LockBatch(WriteBatch* batch, TransactionKeyMap* keys_to_unlock); + + Status DoCommit(WriteBatch* batch); + + void RollbackLastN(size_t num); + + // No copying allowed + TransactionImpl(const TransactionImpl&); + void operator=(const TransactionImpl&); +}; + +// Used at commit time to check whether transaction is committing before its +// expiration time. +class TransactionCallback : public WriteCallback { + public: + explicit TransactionCallback(TransactionImpl* txn) : txn_(txn) {} + + Status Callback(DB* db) override { + if (txn_->IsExpired()) { + return Status::TimedOut(); + } else { + return Status::OK(); + } + } + + private: + TransactionImpl* txn_; +}; + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_lock_mgr.cc b/utilities/transactions/transaction_lock_mgr.cc new file mode 100644 index 000000000..b6cc9eb79 --- /dev/null +++ b/utilities/transactions/transaction_lock_mgr.cc @@ -0,0 +1,443 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#ifndef ROCKSDB_LITE + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include "utilities/transactions/transaction_lock_mgr.h" + +#include + +#include +#include +#include +#include +#include +#include + +#include "rocksdb/slice.h" +#include "util/autovector.h" +#include "util/murmurhash.h" +#include "util/thread_local.h" + +namespace rocksdb { + +struct LockInfo { + TransactionID txn_id; + uint64_t + expiration_time; // Transaction locks are not valid after this time in ms + LockInfo(TransactionID id, uint64_t time) + : txn_id(id), expiration_time(time) {} + LockInfo(const LockInfo& lock_info) + : txn_id(lock_info.txn_id), expiration_time(lock_info.expiration_time) {} +}; + +struct LockMapStripe { + // Mutex must be held before modifying keys map + std::timed_mutex stripe_mutex; + + // Condition Variable per stripe for waiting on a lock + std::condition_variable_any stripe_cv; + + // Locked keys mapped to the info about the transactions that locked them. + // TODO(agiardullo): Explore performance of other data structures. + std::unordered_map keys; +}; + +// Map of #num_stripes LockMapStripes +struct LockMap { + explicit LockMap(size_t num_stripes) + : num_stripes_(num_stripes), lock_map_stripes_(num_stripes) {} + + LockMap(const LockMap& lock_map) + : num_stripes_(lock_map.num_stripes_), lock_map_stripes_(num_stripes_) {} + + // Number of sepearate LockMapStripes to create, each with their own Mutex + const size_t num_stripes_; + + // Count of keys that are currently locked in this column family. + // (Only maintained if TransactionLockMgr::max_num_locks_ is positive.) + std::atomic lock_cnt{0}; + + std::vector lock_map_stripes_; + + size_t GetStripe(const std::string& key) const; +}; + +namespace { +void UnrefLockMapsCache(void* ptr) { + // Called when a thread exits or a ThreadLocalPtr gets destroyed. 
+ auto lock_maps_cache = + static_cast>*>(ptr); + delete lock_maps_cache; +} +} // anonymous namespace + +TransactionLockMgr::TransactionLockMgr(size_t default_num_stripes, + int64_t max_num_locks) + : default_num_stripes_(default_num_stripes), + max_num_locks_(max_num_locks), + lock_maps_cache_(new ThreadLocalPtr(&UnrefLockMapsCache)) {} + +TransactionLockMgr::~TransactionLockMgr() {} + +size_t LockMap::GetStripe(const std::string& key) const { + assert(num_stripes_ > 0); + static murmur_hash hash; + size_t stripe = hash(key) % num_stripes_; + return stripe; +} + +void TransactionLockMgr::AddColumnFamily(uint32_t column_family_id) { + InstrumentedMutexLock l(&lock_map_mutex_); + + if (lock_maps_.find(column_family_id) == lock_maps_.end()) { + lock_maps_.emplace( + column_family_id, + std::shared_ptr(new LockMap(default_num_stripes_))); + } else { + // column_family already exists in lock map + assert(false); + } +} + +void TransactionLockMgr::RemoveColumnFamily(uint32_t column_family_id) { + // Remove lock_map for this column family. Since the lock map is stored + // as a shared ptr, concurrent transactions can still keep keep using it + // until they release their reference to it. + { + InstrumentedMutexLock l(&lock_map_mutex_); + + auto lock_maps_iter = lock_maps_.find(column_family_id); + assert(lock_maps_iter != lock_maps_.end()); + + lock_maps_.erase(lock_maps_iter); + } // lock_map_mutex_ + + // Clear all thread-local caches + autovector local_caches; + lock_maps_cache_->Scrape(&local_caches, nullptr); + for (auto cache : local_caches) { + delete static_cast(cache); + } +} + +// Look up the LockMap shared_ptr for a given column_family_id. +// Note: The LockMap is only valid as long as the caller is still holding on +// to the returned shared_ptr. 
+std::shared_ptr TransactionLockMgr::GetLockMap( + uint32_t column_family_id) { + // First check thread-local cache + if (lock_maps_cache_->Get() == nullptr) { + lock_maps_cache_->Reset(new LockMaps()); + } + + auto lock_maps_cache = static_cast(lock_maps_cache_->Get()); + + auto lock_map_iter = lock_maps_cache->find(column_family_id); + if (lock_map_iter != lock_maps_cache->end()) { + // Found lock map for this column family. + return lock_map_iter->second; + } + + // Not found in local cache, grab mutex and check shared LockMaps + InstrumentedMutexLock l(&lock_map_mutex_); + + lock_map_iter = lock_maps_.find(column_family_id); + if (lock_map_iter == lock_maps_.end()) { + return std::shared_ptr(nullptr); + } else { + // Found lock map. Store in thread-local cache and return. + std::shared_ptr& lock_map = lock_map_iter->second; + lock_maps_cache->insert({column_family_id, lock_map}); + + return lock_map; + } +} + +// Returns true if this lock has expired and can be acquired by another +// transaction. +// If false, returns the number of microseconds until expiration in +// *wait_time_us, or 0 if no expiration. 
+bool TransactionLockMgr::IsLockExpired(const LockInfo& lock_info, Env* env, + uint64_t* wait_time_us) { + auto now = env->NowMicros(); + + bool expired = (lock_info.expiration_time > 0 && + lock_info.expiration_time * 1000 <= now); + + if (!expired && lock_info.expiration_time > 0 && wait_time_us != nullptr) { + // return how many microseconds until lock will be expired + *wait_time_us = (lock_info.expiration_time * 1000 - now); + } + + return expired; +} + +Status TransactionLockMgr::TryLock(const TransactionImpl* txn, + uint32_t column_family_id, + const std::string& key, Env* env) { + // Lookup lock map for this column family id + std::shared_ptr lock_map_ptr = GetLockMap(column_family_id); + LockMap* lock_map = lock_map_ptr.get(); + if (lock_map == nullptr) { + char msg[255]; + snprintf(msg, sizeof(msg), "Column family id not found: %" PRIu32, + column_family_id); + + return Status::InvalidArgument(msg); + } + + // Need to lock the mutex for the stripe that this key hashes to + size_t stripe_num = lock_map->GetStripe(key); + assert(lock_map->lock_map_stripes_.size() > stripe_num); + LockMapStripe* stripe = &lock_map->lock_map_stripes_.at(stripe_num); + + LockInfo lock_info(txn->GetTxnID(), txn->GetExpirationTime()); + int64_t timeout = txn->GetLockTimeout(); + + return AcquireWithTimeout(lock_map, stripe, key, env, timeout, lock_info); +} + +// Helper function for TryLock(). 
+Status TransactionLockMgr::AcquireWithTimeout(LockMap* lock_map, + LockMapStripe* stripe, + const std::string& key, Env* env, + int64_t timeout, + const LockInfo& lock_info) { + std::chrono::system_clock::time_point end_time; + + if (timeout > 0) { + end_time = + std::chrono::system_clock::now() + std::chrono::milliseconds(timeout); + } + + bool locked = true; + if (timeout == 0) { + // If timeout is 0, we do not wait to acquire the lock if it is not + // available + locked = stripe->stripe_mutex.try_lock(); + } else if (timeout < 0) { + // If timeout is negative, we wait indefinitely to acquire the lock + stripe->stripe_mutex.lock(); + } else { + // If timeout is positive, we attempt to acquire the lock unless we timeout + locked = stripe->stripe_mutex.try_lock_until(end_time); + } + + if (!locked) { + // timeout acquiring mutex + return Status::Busy(); + } + + // Acquire lock if we are able to + uint64_t wait_time_us = 0; + Status result = + AcquireLocked(lock_map, stripe, key, env, lock_info, &wait_time_us); + + if (result.IsBusy() && timeout != 0) { + // If we weren't able to acquire the lock, we will keep retrying as long + // as the + // timeout allows. + bool timed_out = false; + do { + // Check to see if the lock expires sooner than our timeout. + std::chrono::system_clock::time_point wait_time_end; + if (wait_time_us > 0 && + (timeout < 0 || + wait_time_us < static_cast(timeout * 1000))) { + wait_time_end = std::chrono::system_clock::now() + + std::chrono::microseconds(wait_time_us); + if (timeout > 0 && wait_time_end >= end_time) { + // lock expiration time is after our timeout. 
+ wait_time_us = 0; + } + } else { + wait_time_us = 0; + } + + if (wait_time_us > 0) { + // Wait up to the locks current expiration time + stripe->stripe_cv.wait_until(stripe->stripe_mutex, wait_time_end); + } else if (timeout > 0) { + // Wait until we timeout + auto cv_status = + stripe->stripe_cv.wait_until(stripe->stripe_mutex, end_time); + + if (cv_status == std::cv_status::timeout) { + timed_out = true; + // Even though we timed out, we will still make one more attempt to + // acquire lock below (it is possible the lock expired and we + // were never signaled). + } + } else { + // No wait timeout. + stripe->stripe_cv.wait(stripe->stripe_mutex); + } + + result = + AcquireLocked(lock_map, stripe, key, env, lock_info, &wait_time_us); + } while (result.IsBusy() && !timed_out); + } + + stripe->stripe_mutex.unlock(); + + return result; +} + +// Try to lock this key after we have acquired the mutex. +// Returns the number of microseconds until expiration in *wait_time_us, +// or 0 if no expiration. +// REQUIRED: Stripe mutex must be held. +Status TransactionLockMgr::AcquireLocked(LockMap* lock_map, + LockMapStripe* stripe, + const std::string& key, Env* env, + const LockInfo& txn_lock_info, + uint64_t* wait_time_us) { + Status result; + // Check if this key is already locked + if (stripe->keys.find(key) != stripe->keys.end()) { + // Lock already held + + LockInfo& lock_info = stripe->keys.at(key); + if (lock_info.txn_id != txn_lock_info.txn_id) { + // locked by another txn. Check if it's expired + if (IsLockExpired(lock_info, env, wait_time_us)) { + // lock is expired, can steal it + lock_info.txn_id = txn_lock_info.txn_id; + lock_info.expiration_time = txn_lock_info.expiration_time; + // lock_cnt does not change + } else { + result = Status::Busy(); + } + } + } else { // Lock not held. 
+ // Check lock limit + if (max_num_locks_ > 0 && + lock_map->lock_cnt.load(std::memory_order_acquire) >= max_num_locks_) { + result = + Status::Busy("Failed to acquire lock due to max_num_locks limit"); + } else { + // acquire lock + stripe->keys.insert({key, txn_lock_info}); + + // Maintain lock count if there is a limit on the number of locks + if (max_num_locks_) { + lock_map->lock_cnt++; + } + } + } + + return result; +} + +void TransactionLockMgr::UnLock(TransactionImpl* txn, uint32_t column_family_id, + const std::string& key, Env* env) { + std::shared_ptr lock_map_ptr = GetLockMap(column_family_id); + LockMap* lock_map = lock_map_ptr.get(); + if (lock_map == nullptr) { + // Column Family must have been dropped. + return; + } + + // Lock the mutex for the stripe that this key hashes to + size_t stripe_num = lock_map->GetStripe(key); + assert(lock_map->lock_map_stripes_.size() > stripe_num); + LockMapStripe* stripe = &lock_map->lock_map_stripes_.at(stripe_num); + + TransactionID txn_id = txn->GetTxnID(); + { + std::lock_guard lock(stripe->stripe_mutex); + + const auto& iter = stripe->keys.find(key); + if (iter != stripe->keys.end() && iter->second.txn_id == txn_id) { + // Found the key we locked. unlock it. + stripe->keys.erase(iter); + if (max_num_locks_ > 0) { + // Maintain lock count if there is a limit on the number of locks. + assert(lock_map->lock_cnt.load(std::memory_order_relaxed) > 0); + lock_map->lock_cnt--; + } + } else { + // This key is either not locked or locked by someone else. This should + // only happen if the unlocking transaction has expired. 
+ assert(txn->GetExpirationTime() > 0 && + txn->GetExpirationTime() * 1000 < env->NowMicros()); + } + } // stripe_mutex unlocked + + // Signal waiting threads to retry locking + stripe->stripe_cv.notify_all(); +} + +void TransactionLockMgr::UnLock(const TransactionImpl* txn, + const TransactionKeyMap* key_map, Env* env) { + TransactionID txn_id = txn->GetTxnID(); + + for (auto& key_map_iter : *key_map) { + uint32_t column_family_id = key_map_iter.first; + auto& keys = key_map_iter.second; + + std::shared_ptr lock_map_ptr = GetLockMap(column_family_id); + LockMap* lock_map = lock_map_ptr.get(); + + if (lock_map == nullptr) { + // Column Family must have been dropped. + return; + } + + // Bucket keys by lock_map_ stripe + std::unordered_map> keys_by_stripe( + std::max(keys.size(), lock_map->num_stripes_)); + + for (auto& key_iter : keys) { + const std::string& key = key_iter.first; + + size_t stripe_num = lock_map->GetStripe(key); + keys_by_stripe[stripe_num].push_back(&key); + } + + // For each stripe, grab the stripe mutex and unlock all keys in this stripe + for (auto& stripe_iter : keys_by_stripe) { + size_t stripe_num = stripe_iter.first; + auto& stripe_keys = stripe_iter.second; + + assert(lock_map->lock_map_stripes_.size() > stripe_num); + LockMapStripe* stripe = &lock_map->lock_map_stripes_.at(stripe_num); + + { + std::lock_guard lock(stripe->stripe_mutex); + + for (const std::string* key : stripe_keys) { + const auto& iter = stripe->keys.find(*key); + if (iter != stripe->keys.end() && iter->second.txn_id == txn_id) { + // Found the key we locked. unlock it. + stripe->keys.erase(iter); + if (max_num_locks_ > 0) { + // Maintain lock count if there is a limit on the number of locks. + assert(lock_map->lock_cnt.load(std::memory_order_relaxed) > 0); + lock_map->lock_cnt--; + } + } else { + // This key is either not locked or locked by someone else. This + // should only + // happen if the unlocking transaction has expired. 
+ assert(txn->GetExpirationTime() > 0 && + txn->GetExpirationTime() * 1000 < env->NowMicros()); + } + } + } // stripe_mutex unlocked + + // Signal waiting threads to retry locking + stripe->stripe_cv.notify_all(); + } + } +} + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_lock_mgr.h b/utilities/transactions/transaction_lock_mgr.h new file mode 100644 index 000000000..7768496a2 --- /dev/null +++ b/utilities/transactions/transaction_lock_mgr.h @@ -0,0 +1,90 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include + +#include "rocksdb/utilities/transaction.h" +#include "util/instrumented_mutex.h" +#include "util/thread_local.h" +#include "utilities/transactions/transaction_impl.h" + +namespace rocksdb { + +class ColumnFamilyHandle; +struct LockInfo; +struct LockMap; +struct LockMapStripe; + +class Slice; + +class TransactionLockMgr { + public: + TransactionLockMgr(size_t default_num_stripes, int64_t max_num_locks); + + ~TransactionLockMgr(); + + // Creates a new LockMap for this column family. Caller should guarantee + // that this column family does not already exist. + void AddColumnFamily(uint32_t column_family_id); + + // Deletes the LockMap for this column family. Caller should guarantee that + // this column family is no longer in use. + void RemoveColumnFamily(uint32_t column_family_id); + + // Attempt to lock key. If OK status is returned, the caller is responsible + // for calling UnLock() on this key. + Status TryLock(const TransactionImpl* txn, uint32_t column_family_id, + const std::string& key, Env* env); + + // Unlock a key locked by TryLock(). 
txn must be the same Transaction that + // locked this key. + void UnLock(const TransactionImpl* txn, const TransactionKeyMap* keys, + Env* env); + void UnLock(TransactionImpl* txn, uint32_t column_family_id, + const std::string& key, Env* env); + + private: + // Default number of lock map stripes per column family + const size_t default_num_stripes_; + + // Limit on number of keys locked per column family + const int64_t max_num_locks_; + + // Must be held when accessing/modifying lock_maps_ + InstrumentedMutex lock_map_mutex_; + + // Map of ColumnFamilyId to locked key info + using LockMaps = std::unordered_map>; + LockMaps lock_maps_; + + // Thread-local cache of entries in lock_maps_. This is an optimization + // to avoid acquiring a mutex in order to look up a LockMap + std::unique_ptr lock_maps_cache_; + + bool IsLockExpired(const LockInfo& lock_info, Env* env, uint64_t* wait_time); + + std::shared_ptr GetLockMap(uint32_t column_family_id); + + Status AcquireWithTimeout(LockMap* lock_map, LockMapStripe* stripe, + const std::string& key, Env* env, int64_t timeout, + const LockInfo& lock_info); + + Status AcquireLocked(LockMap* lock_map, LockMapStripe* stripe, + const std::string& key, Env* env, + const LockInfo& lock_info, uint64_t* wait_time); + + // No copying allowed + TransactionLockMgr(const TransactionLockMgr&); + void operator=(const TransactionLockMgr&); +}; + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc new file mode 100644 index 000000000..8aef74ffd --- /dev/null +++ b/utilities/transactions/transaction_test.cc @@ -0,0 +1,1587 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#ifndef ROCKSDB_LITE + +#include + +#include "rocksdb/db.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "util/logging.h" +#include "util/testharness.h" + +using std::string; + +namespace rocksdb { + +class TransactionTest : public testing::Test { + public: + TransactionDB* db; + string dbname; + Options options; + + TransactionDBOptions txn_db_options; + + TransactionTest() { + options.create_if_missing = true; + options.max_write_buffer_number = 2; + dbname = test::TmpDir() + "/transaction_testdb"; + + DestroyDB(dbname, options); + txn_db_options.transaction_lock_timeout = 0; + txn_db_options.default_lock_timeout = 0; + Status s = TransactionDB::Open(options, txn_db_options, dbname, &db); + assert(s.ok()); + } + + ~TransactionTest() { + delete db; + DestroyDB(dbname, options); + } +}; + +TEST_F(TransactionTest, SuccessTest) { + WriteOptions write_options; + ReadOptions read_options; + string value; + Status s; + + db->Put(write_options, Slice("foo"), Slice("bar")); + db->Put(write_options, Slice("foo2"), Slice("bar")); + + Transaction* txn = db->BeginTransaction(write_options, TransactionOptions()); + ASSERT_TRUE(txn); + + s = txn->GetForUpdate(read_options, "foo", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar"); + + s = txn->Put(Slice("foo"), Slice("bar2")); + ASSERT_OK(s); + + s = txn->GetForUpdate(read_options, "foo", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar2"); + + s = txn->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "foo", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar2"); + + delete txn; +} + +TEST_F(TransactionTest, WriteConflictTest) { + WriteOptions write_options; + ReadOptions read_options; + string value; + Status s; + + db->Put(write_options, "foo", "A"); + db->Put(write_options, "foo2", "B"); + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + s = txn->Put("foo", "A2"); + ASSERT_OK(s); + + s = txn->Put("foo2", "B2"); + ASSERT_OK(s); + + 
// This Put outside of a transaction will conflict with the previous write + s = db->Put(write_options, "foo", "xxx"); + ASSERT_NOK(s); + + s = db->Get(read_options, "foo", &value); + ASSERT_EQ(value, "A"); + + s = txn->Commit(); + ASSERT_OK(s); + + db->Get(read_options, "foo", &value); + ASSERT_EQ(value, "A2"); + db->Get(read_options, "foo2", &value); + ASSERT_EQ(value, "B2"); + + delete txn; +} + +TEST_F(TransactionTest, WriteConflictTest2) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + string value; + Status s; + + db->Put(write_options, "foo", "bar"); + + txn_options.set_snapshot = true; + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn); + + // This Put outside of a transaction will conflict with a later write + s = db->Put(write_options, "foo", "barz"); + ASSERT_OK(s); + + s = txn->Put("foo2", "X"); + ASSERT_OK(s); + + s = txn->Put("foo", + "bar2"); // Conflicts with write done after snapshot taken + ASSERT_NOK(s); + + s = txn->Put("foo3", "Y"); + ASSERT_OK(s); + + s = db->Get(read_options, "foo", &value); + ASSERT_EQ(value, "barz"); + + s = txn->Commit(); + ASSERT_OK(s); // Txn should commit, but only write foo2 and foo3 + + // Verify that transaction wrote foo2 and foo3 but not foo + db->Get(read_options, "foo", &value); + ASSERT_EQ(value, "barz"); + + db->Get(read_options, "foo2", &value); + ASSERT_EQ(value, "X"); + + db->Get(read_options, "foo3", &value); + ASSERT_EQ(value, "Y"); + + delete txn; +} + +TEST_F(TransactionTest, ReadConflictTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + TransactionOptions txn_options; + string value; + Status s; + + db->Put(write_options, "foo", "bar"); + db->Put(write_options, "foo2", "bar"); + + txn_options.set_snapshot = true; + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn); + + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); 
+ + txn->GetForUpdate(snapshot_read_options, "foo", &value); + ASSERT_EQ(value, "bar"); + + // This Put outside of a transaction will conflict with the previous read + s = db->Put(write_options, "foo", "barz"); + ASSERT_NOK(s); + + s = db->Get(read_options, "foo", &value); + ASSERT_EQ(value, "bar"); + + s = txn->Get(read_options, "foo", &value); + ASSERT_EQ(value, "bar"); + + s = txn->Commit(); + ASSERT_OK(s); + + delete txn; +} + +TEST_F(TransactionTest, TxnOnlyTest) { + // Test to make sure transactions work when there are no other writes in an + // empty db. + + WriteOptions write_options; + ReadOptions read_options; + string value; + Status s; + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + s = txn->Put("x", "y"); + ASSERT_OK(s); + + s = txn->Commit(); + ASSERT_OK(s); + + delete txn; +} + +TEST_F(TransactionTest, FlushTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + string value; + Status s; + + db->Put(write_options, Slice("foo"), Slice("bar")); + db->Put(write_options, Slice("foo2"), Slice("bar")); + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + snapshot_read_options.snapshot = txn->GetSnapshot(); + + txn->GetForUpdate(snapshot_read_options, "foo", &value); + ASSERT_EQ(value, "bar"); + + s = txn->Put(Slice("foo"), Slice("bar2")); + ASSERT_OK(s); + + txn->GetForUpdate(snapshot_read_options, "foo", &value); + ASSERT_EQ(value, "bar2"); + + // Put a random key so we have a memtable to flush + s = db->Put(write_options, "dummy", "dummy"); + ASSERT_OK(s); + + // force a memtable flush + FlushOptions flush_ops; + db->Flush(flush_ops); + + s = txn->Commit(); + // txn should commit since the flushed table is still in MemtableList History + ASSERT_OK(s); + + db->Get(read_options, "foo", &value); + ASSERT_EQ(value, "bar2"); + + delete txn; +} + +TEST_F(TransactionTest, FlushTest2) { + WriteOptions write_options; + ReadOptions read_options, 
snapshot_read_options; + TransactionOptions txn_options; + string value; + Status s; + + db->Put(write_options, Slice("foo"), Slice("bar")); + db->Put(write_options, Slice("foo2"), Slice("bar")); + + txn_options.set_snapshot = true; + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn); + + snapshot_read_options.snapshot = txn->GetSnapshot(); + + txn->GetForUpdate(snapshot_read_options, "foo", &value); + ASSERT_EQ(value, "bar"); + + s = txn->Put(Slice("foo"), Slice("bar2")); + ASSERT_OK(s); + + txn->GetForUpdate(snapshot_read_options, "foo", &value); + ASSERT_EQ(value, "bar2"); + + // Put a random key so we have a MemTable to flush + s = db->Put(write_options, "dummy", "dummy"); + ASSERT_OK(s); + + // force a memtable flush + FlushOptions flush_ops; + db->Flush(flush_ops); + + // Put a random key so we have a MemTable to flush + s = db->Put(write_options, "dummy", "dummy2"); + ASSERT_OK(s); + + // force a memtable flush + db->Flush(flush_ops); + + s = db->Put(write_options, "dummy", "dummy3"); + ASSERT_OK(s); + + // force a memtable flush + // Since our test db has max_write_buffer_number=2, this flush will cause + // the first memtable to get purged from the MemtableList history. + db->Flush(flush_ops); + + s = txn->Put("X", "Y"); + ASSERT_NOK(s); // Put should fail since MemTableList History is not older + // than snapshot. + + s = txn->Commit(); + ASSERT_OK(s); + + // Transaction should only write the keys that succeeded. 
+ s = db->Get(read_options, "foo", &value); + ASSERT_EQ(value, "bar2"); + + s = db->Get(read_options, "X", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; +} + +TEST_F(TransactionTest, NoSnapshotTest) { + WriteOptions write_options; + ReadOptions read_options; + string value; + Status s; + + db->Put(write_options, "AAA", "bar"); + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + // Modify key after transaction start + db->Put(write_options, "AAA", "bar1"); + + // Read and write without a snapshot + txn->GetForUpdate(read_options, "AAA", &value); + ASSERT_EQ(value, "bar1"); + s = txn->Put("AAA", "bar2"); + ASSERT_OK(s); + + // Should commit since read/write was done after data changed + s = txn->Commit(); + ASSERT_OK(s); + + txn->GetForUpdate(read_options, "AAA", &value); + ASSERT_EQ(value, "bar2"); + + delete txn; +} + +TEST_F(TransactionTest, MultipleSnapshotTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + string value; + Status s; + + db->Put(write_options, "AAA", "bar"); + db->Put(write_options, "BBB", "bar"); + db->Put(write_options, "CCC", "bar"); + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + db->Put(write_options, "AAA", "bar1"); + + // Read and write without a snapshot + txn->GetForUpdate(read_options, "AAA", &value); + ASSERT_EQ(value, "bar1"); + s = txn->Put("AAA", "bar2"); + ASSERT_OK(s); + + // Modify BBB before snapshot is taken + db->Put(write_options, "BBB", "bar1"); + + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + // Read and write with snapshot + txn->GetForUpdate(snapshot_read_options, "BBB", &value); + ASSERT_EQ(value, "bar1"); + s = txn->Put("BBB", "bar2"); + ASSERT_OK(s); + + db->Put(write_options, "CCC", "bar1"); + + // Set a new snapshot + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + // Read and write with snapshot + txn->GetForUpdate(snapshot_read_options, 
"CCC", &value); + ASSERT_EQ(value, "bar1"); + s = txn->Put("CCC", "bar2"); + ASSERT_OK(s); + + s = txn->GetForUpdate(read_options, "AAA", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar2"); + s = txn->GetForUpdate(read_options, "BBB", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar2"); + s = txn->GetForUpdate(read_options, "CCC", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar2"); + + s = db->Get(read_options, "AAA", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar1"); + s = db->Get(read_options, "BBB", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar1"); + s = db->Get(read_options, "CCC", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar1"); + + s = txn->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "AAA", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar2"); + s = db->Get(read_options, "BBB", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar2"); + s = db->Get(read_options, "CCC", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar2"); + + // verify that we track multiple writes to the same key at different snapshots + delete txn; + txn = db->BeginTransaction(write_options); + + // Potentially conflicting writes + db->Put(write_options, "ZZZ", "zzz"); + db->Put(write_options, "XXX", "xxx"); + + txn->SetSnapshot(); + + TransactionOptions txn_options; + txn_options.set_snapshot = true; + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + txn2->SetSnapshot(); + + // This should not conflict in txn since the snapshot is later than the + // previous write (spoiler alert: it will later conflict with txn2). 
+ s = txn->Put("ZZZ", "zzzz"); + ASSERT_OK(s); + + s = txn->Commit(); + ASSERT_OK(s); + + delete txn; + + // This will conflict since the snapshot is earlier than another write to ZZZ + s = txn2->Put("ZZZ", "xxxxx"); + ASSERT_NOK(s); + + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "ZZZ", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "zzzz"); + + delete txn2; +} + +TEST_F(TransactionTest, ColumnFamiliesTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + TransactionOptions txn_options; + string value; + Status s; + + ColumnFamilyHandle *cfa, *cfb; + ColumnFamilyOptions cf_options; + + // Create 2 new column families + s = db->CreateColumnFamily(cf_options, "CFA", &cfa); + ASSERT_OK(s); + s = db->CreateColumnFamily(cf_options, "CFB", &cfb); + ASSERT_OK(s); + + delete cfa; + delete cfb; + delete db; + + // open DB with three column families + std::vector column_families; + // have to open default column family + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions())); + // open the new column families + column_families.push_back( + ColumnFamilyDescriptor("CFA", ColumnFamilyOptions())); + column_families.push_back( + ColumnFamilyDescriptor("CFB", ColumnFamilyOptions())); + + std::vector handles; + + s = TransactionDB::Open(options, txn_db_options, dbname, column_families, + &handles, &db); + ASSERT_OK(s); + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + txn_options.set_snapshot = true; + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn2); + + // Write some data to the db + WriteBatch batch; + batch.Put("foo", "foo"); + batch.Put(handles[1], "AAA", "bar"); + batch.Put(handles[1], "AAAZZZ", "bar"); + s = db->Write(write_options, &batch); + ASSERT_OK(s); + db->Delete(write_options, handles[1], "AAAZZZ"); + + // These 
keys do not conflict with existing writes since they're in + // different column families + s = txn->Delete("AAA"); + ASSERT_OK(s); + s = txn->GetForUpdate(snapshot_read_options, handles[1], "foo", &value); + ASSERT_TRUE(s.IsNotFound()); + Slice key_slice("AAAZZZ"); + Slice value_slices[2] = {Slice("bar"), Slice("bar")}; + s = txn->Put(handles[2], SliceParts(&key_slice, 1), + SliceParts(value_slices, 2)); + ASSERT_OK(s); + + s = txn->Commit(); + ASSERT_OK(s); + s = db->Get(read_options, "AAA", &value); + ASSERT_TRUE(s.IsNotFound()); + s = db->Get(read_options, handles[2], "AAAZZZ", &value); + ASSERT_EQ(value, "barbar"); + + Slice key_slices[3] = {Slice("AAA"), Slice("ZZ"), Slice("Z")}; + Slice value_slice("barbarbar"); + + s = txn2->Delete(handles[2], "XXX"); + ASSERT_OK(s); + s = txn2->Delete(handles[1], "XXX"); + ASSERT_OK(s); + + // This write will cause a conflict with the earlier batch write + s = txn2->Put(handles[1], SliceParts(key_slices, 3), + SliceParts(&value_slice, 1)); + ASSERT_NOK(s); + + s = txn2->Commit(); + ASSERT_OK(s); + s = db->Get(read_options, handles[1], "AAAZZZ", &value); + ASSERT_EQ(value, "barbar"); + + delete txn; + delete txn2; + + txn = db->BeginTransaction(write_options, txn_options); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + txn2 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn); + + std::vector multiget_cfh = {handles[1], handles[2], + handles[0], handles[2]}; + std::vector multiget_keys = {"AAA", "AAAZZZ", "foo", "foo"}; + std::vector values(4); + + std::vector results = txn->MultiGetForUpdate( + snapshot_read_options, multiget_cfh, multiget_keys, &values); + ASSERT_OK(results[0]); + ASSERT_OK(results[1]); + ASSERT_OK(results[2]); + ASSERT_TRUE(results[3].IsNotFound()); + ASSERT_EQ(values[0], "bar"); + ASSERT_EQ(values[1], "barbar"); + ASSERT_EQ(values[2], "foo"); + + s = txn->Delete(handles[2], "ZZZ"); + ASSERT_OK(s); + s = txn->Put(handles[2], "ZZZ", "YYY"); + ASSERT_OK(s); + s = 
txn->Put(handles[2], "ZZZ", "YYYY"); + ASSERT_OK(s); + s = txn->Delete(handles[2], "ZZZ"); + ASSERT_OK(s); + s = txn->Put(handles[2], "AAAZZZ", "barbarbar"); + ASSERT_OK(s); + + // Txn should commit + s = txn->Commit(); + ASSERT_OK(s); + s = db->Get(read_options, handles[2], "ZZZ", &value); + ASSERT_TRUE(s.IsNotFound()); + + // Put a key which will conflict with the next txn using the previous snapshot + db->Put(write_options, handles[2], "foo", "000"); + + results = txn2->MultiGetForUpdate(snapshot_read_options, multiget_cfh, + multiget_keys, &values); + // All results should fail since there was a conflict + ASSERT_NOK(results[0]); + ASSERT_NOK(results[1]); + ASSERT_NOK(results[2]); + ASSERT_NOK(results[3]); + + s = db->Get(read_options, handles[2], "foo", &value); + ASSERT_EQ(value, "000"); + + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->DropColumnFamily(handles[1]); + ASSERT_OK(s); + s = db->DropColumnFamily(handles[2]); + ASSERT_OK(s); + + delete txn; + delete txn2; + + for (auto handle : handles) { + delete handle; + } +} + +TEST_F(TransactionTest, ColumnFamiliesTest2) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + TransactionOptions txn_options; + string value; + Status s; + + ColumnFamilyHandle *one, *two; + ColumnFamilyOptions cf_options; + + // Create 2 new column families + s = db->CreateColumnFamily(cf_options, "ONE", &one); + ASSERT_OK(s); + s = db->CreateColumnFamily(cf_options, "TWO", &two); + ASSERT_OK(s); + + Transaction* txn1 = db->BeginTransaction(write_options); + ASSERT_TRUE(txn1); + Transaction* txn2 = db->BeginTransaction(write_options); + ASSERT_TRUE(txn2); + + s = txn1->Put(one, "X", "1"); + ASSERT_OK(s); + s = txn1->Put(two, "X", "2"); + ASSERT_OK(s); + s = txn1->Put("X", "0"); + ASSERT_OK(s); + + s = txn2->Put(one, "X", "11"); + ASSERT_TRUE(s.IsBusy()); + + s = txn1->Commit(); + ASSERT_OK(s); + + // Drop first column family + s = db->DropColumnFamily(one); + ASSERT_OK(s); + + // Should fail since 
column family was dropped. + s = txn2->Commit(); + ASSERT_OK(s); + + delete txn1; + txn1 = db->BeginTransaction(write_options); + ASSERT_TRUE(txn1); + + // Should fail since column family was dropped + s = txn1->Put(one, "X", "111"); + ASSERT_TRUE(s.IsInvalidArgument()); + + s = txn1->Put(two, "X", "222"); + ASSERT_OK(s); + + s = txn1->Put("X", "000"); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, two, "X", &value); + ASSERT_OK(s); + ASSERT_EQ("222", value); + + s = db->Get(read_options, "X", &value); + ASSERT_OK(s); + ASSERT_EQ("000", value); + + s = db->DropColumnFamily(two); + ASSERT_OK(s); + + delete txn1; + delete txn2; + + delete one; + delete two; +} + +TEST_F(TransactionTest, EmptyTest) { + WriteOptions write_options; + ReadOptions read_options; + string value; + Status s; + + s = db->Put(write_options, "aaa", "aaa"); + ASSERT_OK(s); + + Transaction* txn = db->BeginTransaction(write_options); + s = txn->Commit(); + ASSERT_OK(s); + delete txn; + + txn = db->BeginTransaction(write_options); + txn->Rollback(); + delete txn; + + txn = db->BeginTransaction(write_options); + s = txn->GetForUpdate(read_options, "aaa", &value); + ASSERT_EQ(value, "aaa"); + + s = txn->Commit(); + ASSERT_OK(s); + delete txn; + + txn = db->BeginTransaction(write_options); + txn->SetSnapshot(); + + s = txn->GetForUpdate(read_options, "aaa", &value); + ASSERT_EQ(value, "aaa"); + + // Conflicts with previous GetForUpdate + s = db->Put(write_options, "aaa", "xxx"); + ASSERT_NOK(s); + + // txn was begun without an expiration, so it can still commit
+ s = txn->Commit(); + ASSERT_OK(s); + delete txn; +} + +TEST_F(TransactionTest, PredicateManyPreceders) { + WriteOptions write_options; + ReadOptions read_options1, read_options2; + TransactionOptions txn_options; + string value; + Status s; + + txn_options.set_snapshot = true; + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + read_options1.snapshot = txn1->GetSnapshot(); + + Transaction* txn2 = db->BeginTransaction(write_options); + txn2->SetSnapshot(); + read_options2.snapshot = txn2->GetSnapshot(); + + std::vector<Slice> multiget_keys = {"1", "2", "3"}; + std::vector<std::string> multiget_values; + + std::vector<Status> results = + txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values); + ASSERT_TRUE(results[1].IsNotFound()); + + s = txn2->Put("2", "x"); // Conflicts with txn1's MultiGetForUpdate + ASSERT_NOK(s); + + txn2->Rollback(); + + multiget_values.clear(); + results = + txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values); + ASSERT_TRUE(results[1].IsNotFound()); + + s = txn1->Commit(); + ASSERT_OK(s); + + delete txn1; + delete txn2; + + txn1 = db->BeginTransaction(write_options, txn_options); + read_options1.snapshot = txn1->GetSnapshot(); + + txn2 = db->BeginTransaction(write_options, txn_options); + read_options2.snapshot = txn2->GetSnapshot(); + + s = txn1->Put("4", "x"); + ASSERT_OK(s); + + s = txn2->Delete("4"); // conflict + ASSERT_NOK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = txn2->GetForUpdate(read_options2, "4", &value); + ASSERT_TRUE(s.IsBusy()); + + txn2->Rollback(); + + delete txn1; + delete txn2; +} + +TEST_F(TransactionTest, LostUpdate) { + WriteOptions write_options; + ReadOptions read_options, read_options1, read_options2; + TransactionOptions txn_options; + string value; + Status s; + + // Test 2 transactions writing to the same key in multiple orders and + // with/without snapshots + + Transaction* txn1 = db->BeginTransaction(write_options); + Transaction* txn2 =
db->BeginTransaction(write_options); + + s = txn1->Put("1", "1"); + ASSERT_OK(s); + + s = txn2->Put("1", "2"); // conflict + ASSERT_NOK(s); + + s = txn2->Commit(); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "1", &value); + ASSERT_OK(s); + ASSERT_EQ("1", value); + + delete txn1; + delete txn2; + + txn_options.set_snapshot = true; + txn1 = db->BeginTransaction(write_options, txn_options); + read_options1.snapshot = txn1->GetSnapshot(); + + txn2 = db->BeginTransaction(write_options, txn_options); + read_options2.snapshot = txn2->GetSnapshot(); + + s = txn1->Put("1", "3"); + ASSERT_OK(s); + s = txn2->Put("1", "4"); // conflict + ASSERT_NOK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "1", &value); + ASSERT_OK(s); + ASSERT_EQ("3", value); + + delete txn1; + delete txn2; + + txn1 = db->BeginTransaction(write_options, txn_options); + read_options1.snapshot = txn1->GetSnapshot(); + + txn2 = db->BeginTransaction(write_options, txn_options); + read_options2.snapshot = txn2->GetSnapshot(); + + s = txn1->Put("1", "5"); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = txn2->Put("1", "6"); + ASSERT_NOK(s); + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "1", &value); + ASSERT_OK(s); + ASSERT_EQ("5", value); + + delete txn1; + delete txn2; + + txn1 = db->BeginTransaction(write_options, txn_options); + read_options1.snapshot = txn1->GetSnapshot(); + + txn2 = db->BeginTransaction(write_options, txn_options); + read_options2.snapshot = txn2->GetSnapshot(); + + s = txn1->Put("1", "7"); + ASSERT_OK(s); + s = txn1->Commit(); + ASSERT_OK(s); + + txn2->SetSnapshot(); + s = txn2->Put("1", "8"); + ASSERT_OK(s); + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "1", &value); + ASSERT_OK(s); + ASSERT_EQ("8", value); + + delete txn1; + delete txn2; + + txn1 = db->BeginTransaction(write_options); + txn2 = 
db->BeginTransaction(write_options); + + s = txn1->Put("1", "9"); + ASSERT_OK(s); + s = txn1->Commit(); + ASSERT_OK(s); + + s = txn2->Put("1", "10"); + ASSERT_OK(s); + s = txn2->Commit(); + ASSERT_OK(s); + + delete txn1; + delete txn2; + + s = db->Get(read_options, "1", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "10"); +} + +TEST_F(TransactionTest, UntrackedWrites) { + WriteOptions write_options; + ReadOptions read_options; + string value; + Status s; + + // Verify transaction rollback works for untracked keys. + Transaction* txn = db->BeginTransaction(write_options); + txn->SetSnapshot(); + + s = txn->PutUntracked("untracked", "0"); + ASSERT_OK(s); + txn->Rollback(); + s = db->Get(read_options, "untracked", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; + txn = db->BeginTransaction(write_options); + txn->SetSnapshot(); + + s = db->Put(write_options, "untracked", "x"); + ASSERT_OK(s); + + // Untracked writes should succeed even though key was written after snapshot + s = txn->PutUntracked("untracked", "1"); + ASSERT_OK(s); + s = txn->MergeUntracked("untracked", "2"); + ASSERT_OK(s); + s = txn->DeleteUntracked("untracked"); + ASSERT_OK(s); + + // Conflict + s = txn->Put("untracked", "3"); + ASSERT_NOK(s); + + s = txn->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "untracked", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; +} + +TEST_F(TransactionTest, ExpiredTransaction) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + string value; + Status s; + + // Set txn expiration timeout to 0 microseconds (expires instantly) + txn_options.expiration = 0; + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + + s = txn1->Put("X", "1"); + ASSERT_OK(s); + + s = txn1->Put("Y", "1"); + ASSERT_OK(s); + + Transaction* txn2 = db->BeginTransaction(write_options); + + // txn2 should be able to write to X since txn1 has expired + s = txn2->Put("X", "2"); + ASSERT_OK(s); + + s = txn2->Commit(); 
+ ASSERT_OK(s); + s = db->Get(read_options, "X", &value); + ASSERT_OK(s); + ASSERT_EQ("2", value); + + s = txn1->Put("Z", "1"); + ASSERT_OK(s); + + // txn1 should fail to commit since it is expired + s = txn1->Commit(); + ASSERT_TRUE(s.IsTimedOut()); + + s = db->Get(read_options, "Y", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = db->Get(read_options, "Z", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn1; + delete txn2; +} + +TEST_F(TransactionTest, Rollback) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + string value; + Status s; + + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + + ASSERT_OK(s); + + s = txn1->Put("X", "1"); + ASSERT_OK(s); + + Transaction* txn2 = db->BeginTransaction(write_options); + + // txn2 should not be able to write to X since txn1 has it locked + s = txn2->Put("X", "2"); + ASSERT_TRUE(s.IsBusy()); + + txn1->Rollback(); + delete txn1; + + // txn2 should now be able to write to X + s = txn2->Put("X", "3"); + ASSERT_OK(s); + + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "X", &value); + ASSERT_OK(s); + ASSERT_EQ("3", value); + + delete txn2; +} + +TEST_F(TransactionTest, LockLimitTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + TransactionOptions txn_options; + string value; + Status s; + + delete db; + + // Open DB with a lock limit of 3 + txn_db_options.max_num_locks = 3; + s = TransactionDB::Open(options, txn_db_options, dbname, &db); + ASSERT_OK(s); + + // Create a txn and verify we can only lock up to 3 keys + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + s = txn->Put("X", "x"); + ASSERT_OK(s); + + s = txn->Put("Y", "y"); + ASSERT_OK(s); + + s = txn->Put("Z", "z"); + ASSERT_OK(s); + + // lock limit reached + s = txn->Put("W", "w"); + ASSERT_TRUE(s.IsBusy()); + + // re-locking same key shouldn't put us over the limit + s = txn->Put("X", "xx"); + ASSERT_OK(s); + + 
s = txn->GetForUpdate(read_options, "W", &value); + ASSERT_TRUE(s.IsBusy()); + s = txn->GetForUpdate(read_options, "V", &value); + ASSERT_TRUE(s.IsBusy()); + + // re-locking same key shouldn't put us over the limit + s = txn->GetForUpdate(read_options, "Y", &value); + ASSERT_OK(s); + ASSERT_EQ("y", value); + + s = txn->Get(read_options, "W", &value); + ASSERT_TRUE(s.IsNotFound()); + + Transaction* txn2 = db->BeginTransaction(write_options); + ASSERT_TRUE(txn2); + + // lock limit reached + s = txn2->Put("X", "x"); + ASSERT_TRUE(s.IsBusy()); + + // lock limit reached + s = txn2->Put("M", "m"); + ASSERT_TRUE(s.IsBusy()); + + s = txn->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "X", &value); + ASSERT_OK(s); + ASSERT_EQ("xx", value); + + s = db->Get(read_options, "W", &value); + ASSERT_TRUE(s.IsNotFound()); + + // Committing txn should release its locks and allow txn2 to proceed + s = txn2->Put("X", "x2"); + ASSERT_OK(s); + + s = txn2->Delete("X"); + ASSERT_OK(s); + + s = txn2->Put("M", "m"); + ASSERT_OK(s); + + s = txn2->Put("Z", "z2"); + ASSERT_OK(s); + + // lock limit reached + s = txn2->Delete("Y"); + ASSERT_TRUE(s.IsBusy()); + + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "Z", &value); + ASSERT_OK(s); + ASSERT_EQ("z2", value); + + s = db->Get(read_options, "Y", &value); + ASSERT_OK(s); + ASSERT_EQ("y", value); + + s = db->Get(read_options, "X", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; + delete txn2; +} + +TEST_F(TransactionTest, IteratorTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + TransactionOptions txn_options; + string value; + Status s; + + // Write some keys to the db + s = db->Put(write_options, "A", "a"); + ASSERT_OK(s); + + s = db->Put(write_options, "G", "g"); + ASSERT_OK(s); + + s = db->Put(write_options, "F", "f"); + ASSERT_OK(s); + + s = db->Put(write_options, "C", "c"); + ASSERT_OK(s); + + s = db->Put(write_options, "D", "d"); + ASSERT_OK(s); + + Transaction* 
txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + // Write some keys in a txn + s = txn->Put("B", "b"); + ASSERT_OK(s); + + s = txn->Put("H", "h"); + ASSERT_OK(s); + + s = txn->Delete("D"); + ASSERT_OK(s); + + s = txn->Put("E", "e"); + ASSERT_OK(s); + + txn->SetSnapshot(); + const Snapshot* snapshot = txn->GetSnapshot(); + + // Write some keys to the db after the snapshot + s = db->Put(write_options, "BB", "xx"); + ASSERT_OK(s); + + s = db->Put(write_options, "C", "xx"); + ASSERT_OK(s); + + read_options.snapshot = snapshot; + Iterator* iter = txn->GetIterator(read_options); + ASSERT_OK(iter->status()); + iter->SeekToFirst(); + + // Read all keys via iter and lock them all + std::string results[] = {"a", "b", "c", "e", "f", "g", "h"}; + for (int i = 0; i < 7; i++) { + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(results[i], iter->value().ToString()); + + s = txn->GetForUpdate(read_options, iter->key(), nullptr); + if (i == 2) { + // "C" was modified after txn's snapshot + ASSERT_TRUE(s.IsBusy()); + } else { + ASSERT_OK(s); + } + + iter->Next(); + } + ASSERT_FALSE(iter->Valid()); + + iter->Seek("G"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("g", iter->value().ToString()); + + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("f", iter->value().ToString()); + + iter->Seek("D"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("e", iter->value().ToString()); + + iter->Seek("C"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("c", iter->value().ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("e", iter->value().ToString()); + + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a", iter->value().ToString()); + + iter->Seek("X"); + ASSERT_OK(iter->status()); + ASSERT_FALSE(iter->Valid()); + + iter->SeekToLast(); + 
ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("h", iter->value().ToString()); + + s = txn->Commit(); + ASSERT_OK(s); + + delete iter; + delete txn; +} + +TEST_F(TransactionTest, SavepointTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + TransactionOptions txn_options; + string value; + Status s; + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + txn->RollbackToSavePoint(); + + txn->SetSavePoint(); // 1 + + txn->RollbackToSavePoint(); // Rollback to beginning of txn + txn->RollbackToSavePoint(); + + s = txn->Put("B", "b"); + ASSERT_OK(s); + + s = txn->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("b", value); + + delete txn; + txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + s = txn->Put("A", "a"); + ASSERT_OK(s); + + s = txn->Put("B", "bb"); + ASSERT_OK(s); + + s = txn->Put("C", "c"); + ASSERT_OK(s); + + txn->SetSavePoint(); // 2 + + s = txn->Delete("B"); + ASSERT_OK(s); + + s = txn->Put("C", "cc"); + ASSERT_OK(s); + + s = txn->Put("D", "d"); + ASSERT_OK(s); + + txn->RollbackToSavePoint(); // Rollback to 2 + + s = txn->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("a", value); + + s = txn->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("bb", value); + + s = txn->Get(read_options, "C", &value); + ASSERT_OK(s); + ASSERT_EQ("c", value); + + s = txn->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Put("A", "a"); + ASSERT_OK(s); + + s = txn->Put("E", "e"); + ASSERT_OK(s); + + txn->RollbackToSavePoint(); // Rollback to beginning of txn + + s = txn->Get(read_options, "A", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("b", value); + + s = txn->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = 
txn->Get(read_options, "E", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Put("A", "aa"); + ASSERT_OK(s); + + s = txn->Put("F", "f"); + ASSERT_OK(s); + + txn->SetSavePoint(); // 3 + txn->SetSavePoint(); // 4 + + s = txn->Put("G", "g"); + ASSERT_OK(s); + + s = txn->Delete("F"); + ASSERT_OK(s); + + s = txn->Delete("B"); + ASSERT_OK(s); + + s = txn->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("aa", value); + + s = txn->Get(read_options, "F", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Get(read_options, "B", &value); + ASSERT_TRUE(s.IsNotFound()); + + txn->RollbackToSavePoint(); // Rollback to 3 + + s = txn->Get(read_options, "F", &value); + ASSERT_OK(s); + ASSERT_EQ("f", value); + + s = txn->Get(read_options, "G", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "F", &value); + ASSERT_OK(s); + ASSERT_EQ("f", value); + + s = db->Get(read_options, "G", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = db->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("aa", value); + + s = db->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("b", value); + + s = db->Get(read_options, "C", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = db->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = db->Get(read_options, "E", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; +} + +TEST_F(TransactionTest, TimeoutTest) { + WriteOptions write_options; + ReadOptions read_options; + string value; + Status s; + + delete db; + + // transaction writes have an infinite timeout, + // but we will override this when we start a txn + // db writes have infinite timeout + txn_db_options.transaction_lock_timeout = -1; + txn_db_options.default_lock_timeout = -1; + + s = TransactionDB::Open(options, txn_db_options, dbname, &db); + ASSERT_OK(s); + + s = db->Put(write_options, "aaa", "aaa"); + ASSERT_OK(s); + + TransactionOptions txn_options0; + txn_options0.expiration = 100; // 
100ms + txn_options0.lock_timeout = 50; // txn timeout no longer infinite + Transaction* txn1 = db->BeginTransaction(write_options, txn_options0); + + s = txn1->GetForUpdate(read_options, "aaa", nullptr); + ASSERT_OK(s); + + // Conflicts with previous GetForUpdate. + // Since db writes do not have a timeout, this should eventually succeed when + // the transaction expires. + s = db->Put(write_options, "aaa", "xxx"); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_NOK(s); // expired! + + s = db->Get(read_options, "aaa", &value); + ASSERT_OK(s); + ASSERT_EQ("xxx", value); + + delete txn1; + delete db; + + // transaction writes have 50ms timeout, + // db writes have infinite timeout + txn_db_options.transaction_lock_timeout = 50; + txn_db_options.default_lock_timeout = -1; + + s = TransactionDB::Open(options, txn_db_options, dbname, &db); + ASSERT_OK(s); + + s = db->Put(write_options, "aaa", "aaa"); + ASSERT_OK(s); + + TransactionOptions txn_options; + txn_options.expiration = 100; // 100ms + txn1 = db->BeginTransaction(write_options, txn_options); + + s = txn1->GetForUpdate(read_options, "aaa", nullptr); + ASSERT_OK(s); + + // Conflicts with previous GetForUpdate. + // Since db writes do not have a timeout, this should eventually succeed when + // the transaction expires. + s = db->Put(write_options, "aaa", "xxx"); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_NOK(s); // expired!
+ + s = db->Get(read_options, "aaa", &value); + ASSERT_OK(s); + ASSERT_EQ("xxx", value); + + delete txn1; + txn_options.expiration = 6000000; // 100 minutes + txn1 = db->BeginTransaction(write_options, txn_options); + + TransactionOptions txn_options2; + txn_options2.expiration = 10; // 10ms + Transaction* txn2 = db->BeginTransaction(write_options, txn_options2); + ASSERT_OK(s); + + s = txn2->Put("a", "2"); + ASSERT_OK(s); + + // txn1 has a lock timeout longer than txn2's expiration, so it will win + s = txn1->Delete("a"); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + + // txn2 should be timed out since txn1 waiting until its timeout expired. + s = txn2->Commit(); + ASSERT_TRUE(s.IsTimedOut()); + + delete txn1; + delete txn2; + txn_options.expiration = 6000000; // 100 minutes + txn1 = db->BeginTransaction(write_options, txn_options); + txn_options2.expiration = 100000000; + txn2 = db->BeginTransaction(write_options, txn_options2); + + s = txn1->Delete("asdf"); + ASSERT_OK(s); + + // txn2 has a smaller lock timeout than txn1's expiration, so it will time out + s = txn2->Delete("asdf"); + ASSERT_TRUE(s.IsBusy()); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = txn2->Put("asdf", "asdf"); + ASSERT_OK(s); + + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "asdf", &value); + ASSERT_OK(s); + ASSERT_EQ("asdf", value); + + delete txn1; + delete txn2; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_util.cc b/utilities/transactions/transaction_util.cc new file mode 100644 index 000000000..086d650ae --- /dev/null +++ b/utilities/transactions/transaction_util.cc @@ -0,0 +1,265 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef ROCKSDB_LITE + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include "utilities/transactions/transaction_util.h" + +#include +#include +#include + +#include "db/db_impl.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "util/string_util.h" + +namespace rocksdb { + +Status TransactionUtil::CheckKeyForConflicts(DBImpl* db_impl, + ColumnFamilyHandle* column_family, + const std::string& key, + SequenceNumber key_seq) { + Status result; + + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + SuperVersion* sv = db_impl->GetAndRefSuperVersion(cfd); + + if (sv == nullptr) { + result = Status::Busy("Could not access column family " + + cfh->GetName()); + } + + if (result.ok()) { + SequenceNumber earliest_seq = + db_impl->GetEarliestMemTableSequenceNumber(sv, true); + + result = CheckKey(db_impl, sv, earliest_seq, key_seq, key); + + db_impl->ReturnAndCleanupSuperVersion(cfd, sv); + } + + return result; +} + +Status TransactionUtil::CheckKey(DBImpl* db_impl, SuperVersion* sv, + SequenceNumber earliest_seq, + SequenceNumber key_seq, + const std::string& key) { + Status result; + + // Since it would be too slow to check the SST files, we will only use + // the memtables to check whether there have been any recent writes + // to this key after it was accessed in this transaction. But if the + // Memtables do not contain a long enough history, we must fail the + // transaction. + if (earliest_seq == kMaxSequenceNumber) { + // The age of this memtable is unknown. Cannot rely on it to check + // for recent writes. This error shouldn't happen often in practice as + // the + // Memtable should have a valid earliest sequence number except in some + // corner cases (such as error cases during recovery). 
+ result = Status::Busy( + "Transaction ould not check for conflicts as the MemTable does not " + "countain a long enough history to check write at SequenceNumber: ", + ToString(key_seq)); + + } else if (key_seq < earliest_seq) { + // The age of this memtable is too new to use to check for recent + // writes. + char msg[255]; + snprintf(msg, sizeof(msg), + "Transaction could not check for conflicts for opearation at " + "SequenceNumber %" PRIu64 + " as the MemTable only contains changes newer than SequenceNumber " + "%" PRIu64 + ". Increasing the value of the " + "max_write_buffer_number_to_maintain option could reduce the " + "frequency " + "of this error.", + key_seq, earliest_seq); + result = Status::Busy(msg); + } else { + SequenceNumber seq = kMaxSequenceNumber; + Status s = db_impl->GetLatestSequenceForKeyFromMemtable(sv, key, &seq); + if (!s.ok()) { + result = s; + } else if (seq != kMaxSequenceNumber && seq > key_seq) { + result = Status::Busy(); + } + } + + return result; +} + +Status TransactionUtil::CheckKeysForConflicts(DBImpl* db_impl, + TransactionKeyMap* key_map) { + Status result; + + for (auto& key_map_iter : *key_map) { + uint32_t cf_id = key_map_iter.first; + const auto& keys = key_map_iter.second; + + SuperVersion* sv = db_impl->GetAndRefSuperVersion(cf_id); + if (sv == nullptr) { + result = + Status::Busy("Could not access column family " + ToString(cf_id)); + break; + } + + SequenceNumber earliest_seq = + db_impl->GetEarliestMemTableSequenceNumber(sv, true); + + // For each of the keys in this transaction, check to see if someone has + // written to this key since the start of the transaction. 
+ for (const auto& key_iter : keys) { + const auto& key = key_iter.first; + const SequenceNumber key_seq = key_iter.second; + + result = CheckKey(db_impl, sv, earliest_seq, key_seq, key); + + if (!result.ok()) { + break; + } + } + + db_impl->ReturnAndCleanupSuperVersion(cf_id, sv); + + if (!result.ok()) { + break; + } + } + + return result; +} + +Status TransactionUtil::CopyFirstN(size_t num, WriteBatchWithIndex* batch, + WriteBatchWithIndex* new_batch, + DBImpl* db_impl) { + // Handler for iterating through batch and copying entries to new_batch + class Handler : public WriteBatch::Handler { + public: + WriteBatchWithIndex* batch; + const size_t limit; + DBImpl* db_impl; + size_t seen = 0; + std::unordered_map<uint32_t, SuperVersion*> super_versions; + std::unordered_map<uint32_t, ColumnFamilyHandle*> handles; + + Handler(WriteBatchWithIndex* dest, size_t new_limit, DBImpl* db) + : batch(dest), limit(new_limit), db_impl(db) {} + + ~Handler() { + for (auto& iter : super_versions) { + db_impl->ReturnAndCleanupSuperVersionUnlocked(iter.first, iter.second); + } + } + + Status GetColumnFamily(uint32_t column_family_id, + ColumnFamilyHandle** cfh) { + // Need to look up ColumnFamilyHandle for this column family id. Since + // doing this requires grabbing a mutex, lets only do it once per column + // family and cache it. + // In order to ensure that the ColumnFamilyHandle is still valid, we need + // to hold the superversion. + const auto& iter = handles.find(column_family_id); + if (iter == handles.end()) { + // Don't have ColumnFamilyHandle cached, look it up from the db.
+ SuperVersion* sv = + db_impl->GetAndRefSuperVersionUnlocked(column_family_id); + if (sv == nullptr) { + return Status::InvalidArgument( + "Could not find column family for ID " + + ToString(column_family_id)); + } + super_versions.insert({column_family_id, sv}); + + *cfh = db_impl->GetColumnFamilyHandleUnlocked(column_family_id); + if (*cfh == nullptr) { + return Status::InvalidArgument( + "Could not find column family handle for ID " + + ToString(column_family_id)); + } + handles.insert({column_family_id, *cfh}); + } else { + *cfh = iter->second; + } + + return Status::OK(); + } + + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + if (seen >= limit) { + // Found the first N entries, return Aborted to stop the Iteration. + return Status::Aborted(); + } + ColumnFamilyHandle* cfh = nullptr; + Status s = GetColumnFamily(column_family_id, &cfh); + if (s.ok()) { + batch->Put(cfh, key, value); + } + seen++; + return s; + } + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + if (seen >= limit) { + // Found the first N entries, return Aborted to stop the Iteration. + return Status::Aborted(); + } + ColumnFamilyHandle* cfh = nullptr; + Status s = GetColumnFamily(column_family_id, &cfh); + if (s.ok()) { + batch->Merge(cfh, key, value); + } + seen++; + return s; + } + virtual Status DeleteCF(uint32_t column_family_id, + const Slice& key) override { + if (seen >= limit) { + // Found the first N entries, return Aborted to stop the Iteration. + return Status::Aborted(); + } + ColumnFamilyHandle* cfh = nullptr; + Status s = GetColumnFamily(column_family_id, &cfh); + if (s.ok()) { + batch->Delete(cfh, key); + } + seen++; + return s; + } + + virtual void LogData(const Slice& blob) override { + if (seen < limit) { + batch->PutLogData(blob); + } + seen++; + } + }; + + // Iterating on this handler will add all keys in this batch into a new batch + // up to + // the limit. 
+ Handler handler(new_batch, num, db_impl); + Status s = batch->GetWriteBatch()->Iterate(&handler); + + if (s.IsAborted()) { + // Handler returns Aborted when it is done copying to stop the iteration. + s = Status::OK(); + } + + return s; +} + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_util.h b/utilities/transactions/transaction_util.h new file mode 100644 index 000000000..21f69a022 --- /dev/null +++ b/utilities/transactions/transaction_util.h @@ -0,0 +1,65 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace rocksdb { + +using TransactionKeyMap = + std::unordered_map>; + +class DBImpl; +struct SuperVersion; +class WriteBatchWithIndex; + +class TransactionUtil { + public: + // Verifies there have been no writes to this key in the db since this + // sequence number. + // + // Returns OK on success, BUSY if there is a conflicting write, or other error + // status for any unexpected errors. + static Status CheckKeyForConflicts(DBImpl* db_impl, + ColumnFamilyHandle* column_family, + const std::string& key, + SequenceNumber key_seq); + + // For each key,SequenceNumber pair in the TransactionKeyMap, this function + // will verify there have been no writes to the key in the db since that + // sequence number. + // + // Returns OK on success, BUSY if there is a conflicting write, or other error + // status for any unexpected errors. + // + // REQUIRED: this function should only be called on the write thread or if the + // mutex is held. 
+ static Status CheckKeysForConflicts(DBImpl* db_impl, TransactionKeyMap* keys); + + // Copies the first num entries from batch into new_batch (including Put, + // Merge, Delete, and PutLogData). + // Returns non-OK on error. + static Status CopyFirstN(size_t num, WriteBatchWithIndex* batch, + WriteBatchWithIndex* new_batch, DBImpl* db_impl); + + private: + static Status CheckKey(DBImpl* db_impl, SuperVersion* sv, + SequenceNumber earliest_seq, SequenceNumber key_seq, + const std::string& key); +}; + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index 507aff248..9308ba39b 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -626,12 +626,15 @@ Status WriteBatchWithIndex::GetFromBatch(ColumnFamilyHandle* column_family, switch (result) { case WriteBatchWithIndexInternal::Result::kFound: case WriteBatchWithIndexInternal::Result::kError: - return s; + // use returned status + break; case WriteBatchWithIndexInternal::Result::kDeleted: case WriteBatchWithIndexInternal::Result::kNotFound: - return Status::NotFound(); + s = Status::NotFound(); + break; case WriteBatchWithIndexInternal::Result::kMergeInProgress: - return Status::MergeInProgress(""); + s = Status::MergeInProgress(""); + break; default: assert(false); } @@ -659,8 +662,8 @@ Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db, std::string batch_value; WriteBatchWithIndexInternal::Result result = WriteBatchWithIndexInternal::GetFromBatch( - options, this, column_family, key, &merge_context, &rep->comparator, - &batch_value, &s); + options, this, column_family, key, &merge_context, + &rep->comparator, &batch_value, &s); if (result == WriteBatchWithIndexInternal::Result::kFound) { value->assign(batch_value.data(), batch_value.size());