Pessimistic Transactions

Summary:
Initial implementation of Pessimistic Transactions.  This diff contains the API changes discussed in D38913.  This diff is pretty large, so let me know if people would prefer to meet up to discuss it.

MyRocks folks:  please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.

Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint().  After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex.  We can then decide which route is preferable.

Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.

Test Plan: Unit tests, db_bench parallel testing.

Reviewers: igor, rven, sdong, yhchiang, yoshinorim

Reviewed By: sdong

Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba

Differential Revision: https://reviews.facebook.net/D40869
main
agiardullo 10 years ago
parent c2868cbc52
commit c2f2cb0214
  1. 5
      CMakeLists.txt
  2. 1
      HISTORY.md
  3. 6
      Makefile
  4. 86
      db/db_bench.cc
  5. 48
      db/db_impl.cc
  6. 10
      db/db_impl.h
  7. 7
      examples/Makefile
  8. 142
      examples/optimistic_transaction_example.cc
  9. 52
      examples/transaction_example.cc
  10. 4
      include/rocksdb/status.h
  11. 233
      include/rocksdb/utilities/optimistic_transaction.h
  12. 18
      include/rocksdb/utilities/optimistic_transaction_db.h
  13. 260
      include/rocksdb/utilities/transaction.h
  14. 130
      include/rocksdb/utilities/transaction_db.h
  15. 5
      src.mk
  16. 3
      util/status.cc
  17. 4
      utilities/transactions/optimistic_transaction_db_impl.cc
  18. 2
      utilities/transactions/optimistic_transaction_db_impl.h
  19. 267
      utilities/transactions/optimistic_transaction_impl.cc
  20. 108
      utilities/transactions/optimistic_transaction_impl.h
  21. 318
      utilities/transactions/optimistic_transaction_test.cc
  22. 254
      utilities/transactions/transaction_db_impl.cc
  23. 80
      utilities/transactions/transaction_db_impl.h
  24. 598
      utilities/transactions/transaction_impl.cc
  25. 263
      utilities/transactions/transaction_impl.h
  26. 443
      utilities/transactions/transaction_lock_mgr.cc
  27. 90
      utilities/transactions/transaction_lock_mgr.h
  28. 1587
      utilities/transactions/transaction_test.cc
  29. 265
      utilities/transactions/transaction_util.cc
  30. 65
      utilities/transactions/transaction_util.h
  31. 13
      utilities/write_batch_with_index/write_batch_with_index.cc

@ -221,6 +221,10 @@ set(SOURCES
utilities/table_properties_collectors/compact_on_deletion_collector.cc utilities/table_properties_collectors/compact_on_deletion_collector.cc
utilities/transactions/optimistic_transaction_impl.cc utilities/transactions/optimistic_transaction_impl.cc
utilities/transactions/optimistic_transaction_db_impl.cc utilities/transactions/optimistic_transaction_db_impl.cc
utilities/transactions/transaction_impl.cc
utilities/transactions/transaction_db_impl.cc
utilities/transactions/transaction_lock_mgr.cc
utilities/transactions/transaction_util.cc
utilities/ttl/db_ttl_impl.cc utilities/ttl/db_ttl_impl.cc
utilities/write_batch_with_index/write_batch_with_index.cc utilities/write_batch_with_index/write_batch_with_index.cc
utilities/write_batch_with_index/write_batch_with_index_internal.cc utilities/write_batch_with_index/write_batch_with_index_internal.cc
@ -333,6 +337,7 @@ set(TESTS
utilities/spatialdb/spatial_db_test.cc utilities/spatialdb/spatial_db_test.cc
utilities/table_properties_collectors/compact_on_deletion_collector_test.cc utilities/table_properties_collectors/compact_on_deletion_collector_test.cc
utilities/transactions/optimistic_transaction_test.cc utilities/transactions/optimistic_transaction_test.cc
utilities/transactions/transaction_test.cc
utilities/ttl/ttl_test.cc utilities/ttl/ttl_test.cc
utilities/write_batch_with_index/write_batch_with_index_test.cc utilities/write_batch_with_index/write_batch_with_index_test.cc
) )

@ -17,6 +17,7 @@
## 3.12.0 (7/2/2015) ## 3.12.0 (7/2/2015)
### New Features ### New Features
* Added experimental support for pessimistic transactions. See include/rocksdb/utilities/transaction.h for more info.
* Added experimental support for optimistic transactions. See include/rocksdb/utilities/optimistic_transaction.h for more info. * Added experimental support for optimistic transactions. See include/rocksdb/utilities/optimistic_transaction.h for more info.
* Added a new way to report QPS from db_bench (check out --report_file and --report_interval_seconds) * Added a new way to report QPS from db_bench (check out --report_file and --report_interval_seconds)
* Added a cache for individual rows. See DBOptions::row_cache for more info. * Added a cache for individual rows. See DBOptions::row_cache for more info.

@ -304,7 +304,8 @@ TESTS = \
write_callback_test \ write_callback_test \
heap_test \ heap_test \
compact_on_deletion_collector_test \ compact_on_deletion_collector_test \
compaction_job_stats_test compaction_job_stats_test \
transaction_test
SUBSET := $(shell echo $(TESTS) |sed s/^.*$(ROCKSDBTESTS_START)/$(ROCKSDBTESTS_START)/) SUBSET := $(shell echo $(TESTS) |sed s/^.*$(ROCKSDBTESTS_START)/$(ROCKSDBTESTS_START)/)
@ -919,6 +920,9 @@ write_callback_test: db/write_callback_test.o $(LIBOBJECTS) $(TESTHARNESS)
heap_test: util/heap_test.o $(GTEST) heap_test: util/heap_test.o $(GTEST)
$(AM_LINK) $(AM_LINK)
transaction_test: utilities/transactions/transaction_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
sst_dump: tools/sst_dump.o $(LIBOBJECTS) sst_dump: tools/sst_dump.o $(LIBOBJECTS)
$(AM_LINK) $(AM_LINK)

@ -54,7 +54,8 @@ int main() {
#include "rocksdb/slice_transform.h" #include "rocksdb/slice_transform.h"
#include "rocksdb/perf_context.h" #include "rocksdb/perf_context.h"
#include "rocksdb/utilities/flashcache.h" #include "rocksdb/utilities/flashcache.h"
#include "rocksdb/utilities/optimistic_transaction.h" #include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"
#include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/optimistic_transaction_db.h"
#include "port/port.h" #include "port/port.h"
#include "port/stack_trace.h" #include "port/stack_trace.h"
@ -448,10 +449,14 @@ DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/"
DEFINE_uint64(delete_obsolete_files_period_micros, 0, DEFINE_uint64(delete_obsolete_files_period_micros, 0,
"Ignored. Left here for backward compatibility"); "Ignored. Left here for backward compatibility");
DEFINE_bool(transaction_db, false, DEFINE_bool(optimistic_transaction_db, false,
"Open a OptimisticTransactionDB instance. " "Open a OptimisticTransactionDB instance. "
"Required for randomtransaction benchmark."); "Required for randomtransaction benchmark.");
DEFINE_bool(transaction_db, false,
"Open a TransactionDB instance. "
"Required for randomtransaction benchmark.");
DEFINE_uint64(transaction_sets, 2, DEFINE_uint64(transaction_sets, 2,
"Number of keys each transaction will " "Number of keys each transaction will "
"modify (use in RandomTransaction only). Max: 9999"); "modify (use in RandomTransaction only). Max: 9999");
@ -919,7 +924,7 @@ static void AppendWithSpace(std::string* str, Slice msg) {
struct DBWithColumnFamilies { struct DBWithColumnFamilies {
std::vector<ColumnFamilyHandle*> cfh; std::vector<ColumnFamilyHandle*> cfh;
DB* db; DB* db;
OptimisticTransactionDB* txn_db; OptimisticTransactionDB* opt_txn_db;
std::atomic<size_t> num_created; // Need to be updated after all the std::atomic<size_t> num_created; // Need to be updated after all the
// new entries in cfh are set. // new entries in cfh are set.
size_t num_hot; // Number of column families to be queried at each moment. size_t num_hot; // Number of column families to be queried at each moment.
@ -927,7 +932,7 @@ struct DBWithColumnFamilies {
// Column families will be created and used to be queried. // Column families will be created and used to be queried.
port::Mutex create_cf_mutex; // Only one thread can execute CreateNewCf() port::Mutex create_cf_mutex; // Only one thread can execute CreateNewCf()
DBWithColumnFamilies() : db(nullptr), txn_db(nullptr) { DBWithColumnFamilies() : db(nullptr), opt_txn_db(nullptr) {
cfh.clear(); cfh.clear();
num_created = 0; num_created = 0;
num_hot = 0; num_hot = 0;
@ -936,7 +941,7 @@ struct DBWithColumnFamilies {
DBWithColumnFamilies(const DBWithColumnFamilies& other) DBWithColumnFamilies(const DBWithColumnFamilies& other)
: cfh(other.cfh), : cfh(other.cfh),
db(other.db), db(other.db),
txn_db(other.txn_db), opt_txn_db(other.opt_txn_db),
num_created(other.num_created.load()), num_created(other.num_created.load()),
num_hot(other.num_hot) {} num_hot(other.num_hot) {}
@ -944,9 +949,9 @@ struct DBWithColumnFamilies {
std::for_each(cfh.begin(), cfh.end(), std::for_each(cfh.begin(), cfh.end(),
[](ColumnFamilyHandle* cfhi) { delete cfhi; }); [](ColumnFamilyHandle* cfhi) { delete cfhi; });
cfh.clear(); cfh.clear();
if (txn_db) { if (opt_txn_db) {
delete txn_db; delete opt_txn_db;
txn_db = nullptr; opt_txn_db = nullptr;
} else { } else {
delete db; delete db;
} }
@ -2445,11 +2450,19 @@ class Benchmark {
if (FLAGS_readonly) { if (FLAGS_readonly) {
s = DB::OpenForReadOnly(options, db_name, column_families, s = DB::OpenForReadOnly(options, db_name, column_families,
&db->cfh, &db->db); &db->cfh, &db->db);
} else if (FLAGS_transaction_db) { } else if (FLAGS_optimistic_transaction_db) {
s = OptimisticTransactionDB::Open(options, db_name, column_families, s = OptimisticTransactionDB::Open(options, db_name, column_families,
&db->cfh, &db->txn_db); &db->cfh, &db->opt_txn_db);
if (s.ok()) {
db->db = db->opt_txn_db->GetBaseDB();
}
} else if (FLAGS_transaction_db) {
TransactionDB* ptr;
TransactionDBOptions txn_db_options;
s = TransactionDB::Open(options, txn_db_options, db_name,
column_families, &db->cfh, &ptr);
if (s.ok()) { if (s.ok()) {
db->db = db->txn_db->GetBaseDB(); db->db = ptr;
} }
} else { } else {
s = DB::Open(options, db_name, column_families, &db->cfh, &db->db); s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
@ -2459,11 +2472,19 @@ class Benchmark {
db->num_hot = num_hot; db->num_hot = num_hot;
} else if (FLAGS_readonly) { } else if (FLAGS_readonly) {
s = DB::OpenForReadOnly(options, db_name, &db->db); s = DB::OpenForReadOnly(options, db_name, &db->db);
} else if (FLAGS_optimistic_transaction_db) {
s = OptimisticTransactionDB::Open(options, db_name, &db->opt_txn_db);
if (s.ok()) {
db->db = db->opt_txn_db->GetBaseDB();
}
} else if (FLAGS_transaction_db) { } else if (FLAGS_transaction_db) {
s = OptimisticTransactionDB::Open(options, db_name, &db->txn_db); TransactionDB* ptr;
TransactionDBOptions txn_db_options;
s = TransactionDB::Open(options, txn_db_options, db_name, &ptr);
if (s.ok()) { if (s.ok()) {
db->db = db->txn_db->GetBaseDB(); db->db = ptr;
} }
} else { } else {
s = DB::Open(options, db_name, &db->db); s = DB::Open(options, db_name, &db->db);
} }
@ -3530,7 +3551,6 @@ class Benchmark {
uint64_t transactions_aborted = 0; uint64_t transactions_aborted = 0;
Status s; Status s;
uint64_t num_prefix_ranges = FLAGS_transaction_sets; uint64_t num_prefix_ranges = FLAGS_transaction_sets;
bool use_txn = FLAGS_transaction_db;
if (num_prefix_ranges == 0 || num_prefix_ranges > 9999) { if (num_prefix_ranges == 0 || num_prefix_ranges > 9999) {
fprintf(stderr, "invalid value for transaction_sets\n"); fprintf(stderr, "invalid value for transaction_sets\n");
@ -3545,12 +3565,17 @@ class Benchmark {
} }
while (!duration.Done(1)) { while (!duration.Done(1)) {
OptimisticTransaction* txn = nullptr; Transaction* txn = nullptr;
WriteBatch* batch = nullptr; WriteBatch* batch = nullptr;
if (use_txn) { if (FLAGS_optimistic_transaction_db) {
txn = db_.txn_db->BeginTransaction(write_options_); txn = db_.opt_txn_db->BeginTransaction(write_options_);
assert(txn); assert(txn);
} else if (FLAGS_transaction_db) {
TransactionDB* txn_db = reinterpret_cast<TransactionDB*>(db_.db);
TransactionOptions txn_options;
txn_options.expiration = 10000000;
txn = txn_db->BeginTransaction(write_options_, txn_options);
} else { } else {
batch = new WriteBatch(); batch = new WriteBatch();
} }
@ -3558,6 +3583,7 @@ class Benchmark {
// pick a random number to use to increment a key in each set // pick a random number to use to increment a key in each set
uint64_t incr = (thread->rand.Next() % 100) + 1; uint64_t incr = (thread->rand.Next() % 100) + 1;
bool failed = false;
// For each set, pick a key at random and increment it // For each set, pick a key at random and increment it
for (uint8_t i = 0; i < num_prefix_ranges; i++) { for (uint8_t i = 0; i < num_prefix_ranges; i++) {
uint64_t int_value; uint64_t int_value;
@ -3572,8 +3598,8 @@ class Benchmark {
std::string full_key = std::string(prefix_buf) + base_key.ToString(); std::string full_key = std::string(prefix_buf) + base_key.ToString();
Slice key(full_key); Slice key(full_key);
if (use_txn) { if (txn) {
s = txn->Get(read_options, key, &value); s = txn->GetForUpdate(read_options, key, &value);
} else { } else {
s = db->Get(read_options, key, &value); s = db->Get(read_options, key, &value);
} }
@ -3599,15 +3625,23 @@ class Benchmark {
} }
std::string sum = ToString(int_value + incr); std::string sum = ToString(int_value + incr);
if (use_txn) { if (txn) {
txn->Put(key, sum); s = txn->Put(key, sum);
if (!s.ok()) {
failed = true;
break;
}
} else { } else {
batch->Put(key, sum); batch->Put(key, sum);
} }
} }
if (use_txn) { if (txn) {
s = txn->Commit(); if (failed) {
txn->Rollback();
} else {
s = txn->Commit();
}
} else { } else {
s = db->Write(write_options_, batch); s = db->Write(write_options_, batch);
} }
@ -3616,7 +3650,7 @@ class Benchmark {
// Ideally, we'd want to run this stress test with enough concurrency // Ideally, we'd want to run this stress test with enough concurrency
// on a small enough set of keys that we get some failed transactions // on a small enough set of keys that we get some failed transactions
// due to conflicts. // due to conflicts.
if (use_txn && s.IsBusy()) { if (txn && s.IsBusy()) {
transactions_aborted++; transactions_aborted++;
} else { } else {
fprintf(stderr, "Unexpected write error: %s\n", s.ToString().c_str()); fprintf(stderr, "Unexpected write error: %s\n", s.ToString().c_str());
@ -3635,7 +3669,7 @@ class Benchmark {
} }
char msg[100]; char msg[100];
if (use_txn) { if (FLAGS_optimistic_transaction_db || FLAGS_transaction_db) {
snprintf(msg, sizeof(msg), snprintf(msg, sizeof(msg),
"( transactions:%" PRIu64 " aborts:%" PRIu64 ")", "( transactions:%" PRIu64 " aborts:%" PRIu64 ")",
transactions_done, transactions_aborted); transactions_done, transactions_aborted);
@ -3653,7 +3687,7 @@ class Benchmark {
// Since each iteration of RandomTransaction() incremented a key in each set // Since each iteration of RandomTransaction() incremented a key in each set
// by the same value, the sum of the keys in each set should be the same. // by the same value, the sum of the keys in each set should be the same.
void RandomTransactionVerify() { void RandomTransactionVerify() {
if (!FLAGS_transaction_db) { if (!FLAGS_transaction_db && !FLAGS_optimistic_transaction_db) {
// transactions not used, nothing to verify. // transactions not used, nothing to verify.
return; return;
} }

@ -3686,7 +3686,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
mutex_.Lock(); mutex_.Lock();
} }
if (db_options_.paranoid_checks && !status.ok() && if (db_options_.paranoid_checks && !status.ok() && !status.IsTimedOut() &&
!status.IsBusy() && bg_error_.ok()) { !status.IsBusy() && bg_error_.ok()) {
bg_error_ = status; // stop compaction & fail any further writes bg_error_ = status; // stop compaction & fail any further writes
} }
@ -3944,6 +3944,22 @@ SuperVersion* DBImpl::GetAndRefSuperVersion(uint32_t column_family_id) {
return GetAndRefSuperVersion(cfd); return GetAndRefSuperVersion(cfd);
} }
// Variant of GetAndRefSuperVersion(uint32_t) for callers that do not hold the
// DB mutex. The column family is looked up while holding mutex_; the mutex is
// released again before the SuperVersion is referenced.
// Returns nullptr when no column family with the given id is found.
// REQUIRED: mutex is NOT held
SuperVersion* DBImpl::GetAndRefSuperVersionUnlocked(uint32_t column_family_id) {
  ColumnFamilyData* cfd = nullptr;
  {
    // Hold the mutex only for the duration of the lookup.
    InstrumentedMutexLock guard(&mutex_);
    cfd = versions_->GetColumnFamilySet()->GetColumnFamily(column_family_id);
  }
  return cfd != nullptr ? GetAndRefSuperVersion(cfd) : nullptr;
}
void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd,
SuperVersion* sv) { SuperVersion* sv) {
bool unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv); bool unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv);
@ -3974,6 +3990,22 @@ void DBImpl::ReturnAndCleanupSuperVersion(uint32_t column_family_id,
ReturnAndCleanupSuperVersion(cfd, sv); ReturnAndCleanupSuperVersion(cfd, sv);
} }
// Variant of ReturnAndCleanupSuperVersion(uint32_t, SuperVersion*) for callers
// that do not hold the DB mutex. The column family is resolved while holding
// mutex_, then the work is handed to the ColumnFamilyData overload with the
// mutex released.
// REQUIRED: Mutex should NOT be held.
void DBImpl::ReturnAndCleanupSuperVersionUnlocked(uint32_t column_family_id,
                                                  SuperVersion* sv) {
  ColumnFamilyData* cfd = nullptr;
  {
    // Hold the mutex only for the duration of the lookup.
    InstrumentedMutexLock guard(&mutex_);
    cfd = versions_->GetColumnFamilySet()->GetColumnFamily(column_family_id);
  }

  // If SuperVersion is held, and we successfully fetched a cfd using
  // GetAndRefSuperVersion(), it must still exist.
  assert(cfd != nullptr);
  ReturnAndCleanupSuperVersion(cfd, sv);
}
// REQUIRED: this function should only be called on the write thread or if the // REQUIRED: this function should only be called on the write thread or if the
// mutex is held. // mutex is held.
ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) { ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) {
@ -3986,6 +4018,20 @@ ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) {
return cf_memtables->GetColumnFamilyHandle(); return cf_memtables->GetColumnFamilyHandle();
} }
// Variant of GetColumnFamilyHandle() that acquires mutex_ itself. Seeks the
// column family memtables to column_family_id and returns the corresponding
// handle, or nullptr if the seek fails.
// NOTE(review): the handle is returned after mutex_ has been released; confirm
// how long callers may rely on it.
// REQUIRED: mutex is NOT held.
ColumnFamilyHandle* DBImpl::GetColumnFamilyHandleUnlocked(
    uint32_t column_family_id) {
  ColumnFamilyMemTables* const memtables = column_family_memtables_.get();

  InstrumentedMutexLock guard(&mutex_);
  const bool found = memtables->Seek(column_family_id);
  return found ? memtables->GetColumnFamilyHandle() : nullptr;
}
void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family,
const Range* range, int n, uint64_t* sizes, const Range* range, int n, uint64_t* sizes,
bool include_memtable) { bool include_memtable) {

@ -326,6 +326,9 @@ class DBImpl : public DB {
// mutex is held. // mutex is held.
SuperVersion* GetAndRefSuperVersion(uint32_t column_family_id); SuperVersion* GetAndRefSuperVersion(uint32_t column_family_id);
// Same as above, but should be called without the mutex held and not on the write thread.
SuperVersion* GetAndRefSuperVersionUnlocked(uint32_t column_family_id);
// Un-reference the super version and return it to thread local cache if // Un-reference the super version and return it to thread local cache if
// needed. If it is the last reference of the super version. Clean it up // needed. If it is the last reference of the super version. Clean it up
// after un-referencing it. // after un-referencing it.
@ -336,11 +339,18 @@ class DBImpl : public DB {
// REQUIRED: this function should only be called on the write thread. // REQUIRED: this function should only be called on the write thread.
void ReturnAndCleanupSuperVersion(uint32_t colun_family_id, SuperVersion* sv); void ReturnAndCleanupSuperVersion(uint32_t colun_family_id, SuperVersion* sv);
// Same as above, but should be called without the mutex held and not on the write thread.
void ReturnAndCleanupSuperVersionUnlocked(uint32_t colun_family_id,
SuperVersion* sv);
// REQUIRED: this function should only be called on the write thread or if the // REQUIRED: this function should only be called on the write thread or if the
// mutex is held. Return value only valid until next call to this function or // mutex is held. Return value only valid until next call to this function or
// mutex is released. // mutex is released.
ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id); ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id);
// Same as above, but should be called without the mutex held and not on the write thread.
ColumnFamilyHandle* GetColumnFamilyHandleUnlocked(uint32_t column_family_id);
protected: protected:
Env* const env_; Env* const env_;
const std::string dbname_; const std::string dbname_;

@ -2,7 +2,7 @@ include ../make_config.mk
.PHONY: clean .PHONY: clean
all: simple_example column_families_example compact_files_example c_simple_example transaction_example all: simple_example column_families_example compact_files_example c_simple_example optimistic_transaction_example transaction_example
simple_example: simple_example.cc simple_example: simple_example.cc
$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
@ -19,8 +19,11 @@ compact_files_example: compact_files_example.cc
c_simple_example: c_simple_example.o c_simple_example: c_simple_example.o
$(CXX) $@.o -o$@ ../librocksdb.a $(PLATFORM_LDFLAGS) $(EXEC_LDFLAGS) $(CXX) $@.o -o$@ ../librocksdb.a $(PLATFORM_LDFLAGS) $(EXEC_LDFLAGS)
optimistic_transaction_example: optimistic_transaction_example.cc
$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
transaction_example: transaction_example.cc transaction_example: transaction_example.cc
$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
clean: clean:
rm -rf ./simple_example ./column_families_example ./compact_files_example ./c_simple_example c_simple_example.o ./transaction_example rm -rf ./simple_example ./column_families_example ./compact_files_example ./c_simple_example c_simple_example.o ./optimistic_transaction_example ./transaction_example

@ -0,0 +1,142 @@
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef ROCKSDB_LITE
#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice.h"
#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/optimistic_transaction_db.h"
using namespace rocksdb;
// Filesystem location of the database this example opens (created on demand,
// since create_if_missing is set) and destroys again at the end of main().
// NOTE(review): examples/transaction_example.cc defines the same path; confirm
// whether the two examples are meant to share one directory.
std::string kDBPath = "/tmp/rocksdb_transaction_example";
// Walks through three usage patterns of OptimisticTransactionDB:
//   1. a simple transaction without a snapshot,
//   2. a transaction using a single snapshot, where a conflicting write made
//      outside the transaction causes Commit() to return Busy,
//   3. a transaction whose snapshot is advanced part-way through.
// The database at kDBPath is created if missing and destroyed before exit.
// Returns 0; expectations are checked with assert().
int main() {
  // open DB
  Options options;
  options.create_if_missing = true;
  DB* db;
  OptimisticTransactionDB* txn_db;

  Status s = OptimisticTransactionDB::Open(options, kDBPath, &txn_db);
  assert(s.ok());
  db = txn_db->GetBaseDB();

  WriteOptions write_options;
  ReadOptions read_options;
  OptimisticTransactionOptions txn_options;
  std::string value;

  ////////////////////////////////////////////////////////
  //
  // Simple OptimisticTransaction Example ("Read Committed")
  //
  ////////////////////////////////////////////////////////

  // Start a transaction
  Transaction* txn = txn_db->BeginTransaction(write_options);
  assert(txn);

  // Read a key in this transaction
  s = txn->Get(read_options, "abc", &value);
  assert(s.IsNotFound());

  // Write a key in this transaction
  txn->Put("abc", "def");

  // Read a key OUTSIDE this transaction. Does not affect txn.
  s = db->Get(read_options, "abc", &value);

  // Write a key OUTSIDE of this transaction.
  // Does not affect txn since this is an unrelated key.  If we wrote key 'abc'
  // here, the transaction would fail to commit.
  s = db->Put(write_options, "xyz", "zzz");

  // Commit transaction
  s = txn->Commit();
  assert(s.ok());
  delete txn;

  ////////////////////////////////////////////////////////
  //
  // "Repeatable Read" (Snapshot Isolation) Example
  //   -- Using a single Snapshot
  //
  ////////////////////////////////////////////////////////

  // Set a snapshot at start of transaction by setting set_snapshot=true
  txn_options.set_snapshot = true;
  txn = txn_db->BeginTransaction(write_options, txn_options);

  // This snapshot comes from the transaction, not from DB::GetSnapshot(), so
  // it is not released explicitly here.
  const Snapshot* snapshot = txn->GetSnapshot();

  // Write a key OUTSIDE of transaction
  db->Put(write_options, "abc", "xyz");

  // Read a key using the snapshot
  read_options.snapshot = snapshot;
  s = txn->GetForUpdate(read_options, "abc", &value);
  assert(value == "def");

  // Attempt to commit transaction
  s = txn->Commit();

  // Transaction could not commit since the write outside of the txn conflicted
  // with the read!
  assert(s.IsBusy());

  delete txn;
  // Clear snapshot from read options since it is no longer valid
  read_options.snapshot = nullptr;
  snapshot = nullptr;

  ////////////////////////////////////////////////////////
  //
  // "Read Committed" (Monotonic Atomic Views) Example
  //   --Using multiple Snapshots
  //
  ////////////////////////////////////////////////////////

  // In this example, we set the snapshot multiple times.  This is probably
  // only necessary if you have very strict isolation requirements to
  // implement.

  // Set a snapshot at start of transaction
  txn_options.set_snapshot = true;
  txn = txn_db->BeginTransaction(write_options, txn_options);

  // Do some reads and writes to key "x"
  read_options.snapshot = db->GetSnapshot();
  s = txn->Get(read_options, "x", &value);
  txn->Put("x", "x");

  // Do a write outside of the transaction to key "y"
  s = db->Put(write_options, "y", "y");

  // Set a new snapshot in the transaction
  txn->SetSnapshot();

  // Snapshots handed out by DB::GetSnapshot() must be given back with
  // DB::ReleaseSnapshot(); release the old one before replacing the pointer,
  // otherwise it would be leaked.
  db->ReleaseSnapshot(read_options.snapshot);
  read_options.snapshot = db->GetSnapshot();

  // Do some reads and writes to key "y"
  s = txn->GetForUpdate(read_options, "y", &value);
  txn->Put("y", "y");

  // Commit.  Since the snapshot was advanced, the write done outside of the
  // transaction does not prevent this transaction from Committing.
  s = txn->Commit();
  assert(s.ok());
  delete txn;

  // Release the second DB snapshot before the DB itself is closed, then clear
  // it from read options since it is no longer valid.
  db->ReleaseSnapshot(read_options.snapshot);
  read_options.snapshot = nullptr;

  // Cleanup
  delete txn_db;
  DestroyDB(kDBPath, options);
  return 0;
}
#endif // ROCKSDB_LITE

@ -8,8 +8,8 @@
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/options.h" #include "rocksdb/options.h"
#include "rocksdb/slice.h" #include "rocksdb/slice.h"
#include "rocksdb/utilities/optimistic_transaction.h" #include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/transaction_db.h"
using namespace rocksdb; using namespace rocksdb;
@ -18,17 +18,16 @@ std::string kDBPath = "/tmp/rocksdb_transaction_example";
int main() { int main() {
// open DB // open DB
Options options; Options options;
TransactionDBOptions txn_db_options;
options.create_if_missing = true; options.create_if_missing = true;
DB* db; TransactionDB* txn_db;
OptimisticTransactionDB* txn_db;
Status s = OptimisticTransactionDB::Open(options, kDBPath, &txn_db); Status s = TransactionDB::Open(options, txn_db_options, kDBPath, &txn_db);
assert(s.ok()); assert(s.ok());
db = txn_db->GetBaseDB();
WriteOptions write_options; WriteOptions write_options;
ReadOptions read_options; ReadOptions read_options;
OptimisticTransactionOptions txn_options; TransactionOptions txn_options;
std::string value; std::string value;
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
@ -38,7 +37,7 @@ int main() {
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// Start a transaction // Start a transaction
OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); Transaction* txn = txn_db->BeginTransaction(write_options);
assert(txn); assert(txn);
// Read a key in this transaction // Read a key in this transaction
@ -46,15 +45,16 @@ int main() {
assert(s.IsNotFound()); assert(s.IsNotFound());
// Write a key in this transaction // Write a key in this transaction
txn->Put("abc", "def"); s = txn->Put("abc", "def");
assert(s.ok());
// Read a key OUTSIDE this transaction. Does not affect txn. // Read a key OUTSIDE this transaction. Does not affect txn.
s = db->Get(read_options, "abc", &value); s = txn_db->Get(read_options, "abc", &value);
// Write a key OUTSIDE of this transaction. // Write a key OUTSIDE of this transaction.
// Does not affect txn since this is an unrelated key. If we wrote key 'abc' // Does not affect txn since this is an unrelated key. If we wrote key 'abc'
// here, the transaction would fail to commit. // here, the transaction would fail to commit.
s = db->Put(write_options, "xyz", "zzz"); s = txn_db->Put(write_options, "xyz", "zzz");
// Commit transaction // Commit transaction
s = txn->Commit(); s = txn->Commit();
@ -75,20 +75,17 @@ int main() {
const Snapshot* snapshot = txn->GetSnapshot(); const Snapshot* snapshot = txn->GetSnapshot();
// Write a key OUTSIDE of transaction // Write a key OUTSIDE of transaction
db->Put(write_options, "abc", "xyz"); s = txn_db->Put(write_options, "abc", "xyz");
assert(s.ok());
// Read a key using the snapshot // Attempt to read a key using the snapshot. This will fail since
// the previous write outside this txn conflicts with this read.
read_options.snapshot = snapshot; read_options.snapshot = snapshot;
s = txn->GetForUpdate(read_options, "abc", &value); s = txn->GetForUpdate(read_options, "abc", &value);
assert(value == "def");
// Attempt to commit transaction
s = txn->Commit();
// Transaction could not commit since the write outside of the txn conflicted
// with the read!
assert(s.IsBusy()); assert(s.IsBusy());
txn->Rollback();
delete txn; delete txn;
// Clear snapshot from read options since it is no longer valid // Clear snapshot from read options since it is no longer valid
read_options.snapshot = nullptr; read_options.snapshot = nullptr;
@ -110,23 +107,28 @@ int main() {
txn = txn_db->BeginTransaction(write_options, txn_options); txn = txn_db->BeginTransaction(write_options, txn_options);
// Do some reads and writes to key "x" // Do some reads and writes to key "x"
read_options.snapshot = db->GetSnapshot(); read_options.snapshot = txn_db->GetSnapshot();
s = txn->Get(read_options, "x", &value); s = txn->Get(read_options, "x", &value);
txn->Put("x", "x"); txn->Put("x", "x");
// Do a write outside of the transaction to key "y" // Do a write outside of the transaction to key "y"
s = db->Put(write_options, "y", "y"); s = txn_db->Put(write_options, "y", "y");
// Set a new snapshot in the transaction // Set a new snapshot in the transaction
txn->SetSnapshot(); txn->SetSnapshot();
read_options.snapshot = db->GetSnapshot(); txn->SetSavePoint();
read_options.snapshot = txn_db->GetSnapshot();
// Do some reads and writes to key "y" // Do some reads and writes to key "y"
// Since the snapshot was advanced, the write done outside of the
// transaction does not conflict.
s = txn->GetForUpdate(read_options, "y", &value); s = txn->GetForUpdate(read_options, "y", &value);
txn->Put("y", "y"); txn->Put("y", "y");
// Commit. Since the snapshot was advanced, the write done outside of the // Decide we want to revert the last write from this transaction.
// transaction does not prevent this transaction from Committing. txn->RollbackToSavePoint();
// Commit.
s = txn->Commit(); s = txn->Commit();
assert(s.ok()); assert(s.ok());
delete txn; delete txn;

@ -80,6 +80,10 @@ class Status {
static Status Busy(const Slice& msg, const Slice& msg2 = Slice()) { static Status Busy(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kBusy, msg, msg2); return Status(kBusy, msg, msg2);
} }
// Returns a Status carrying the kTimedOut code and no message.
static Status TimedOut() { return Status(kTimedOut); }
// Returns a Status carrying the kTimedOut code; msg and the optional msg2 are
// forwarded to the Status constructor.
static Status TimedOut(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kTimedOut, msg, msg2);
}
// Returns true iff the status indicates success. // Returns true iff the status indicates success.
bool ok() const { return code() == kOk; } bool ok() const { return code() == kOk; }

@ -1,233 +0,0 @@
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#ifndef ROCKSDB_LITE
#include <string>
#include <vector>
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/status.h"
namespace rocksdb {
class OptimisticTransactionDB;
class WriteBatchWithIndex;
// Provides BEGIN/COMMIT/ROLLBACK transactions for batched writes.
//
// The current implementation provides optimistic concurrency control.
// Transactional reads/writes will not block other operations in the
// db. At commit time, the batch of writes will only be written if there have
// been no other writes to any keys read or written by this transaction.
// Otherwise, the commit will return an error.
//
// A new optimistic transaction is created by calling
// OptimisticTransactionDB::BeginTransaction().
// Only reads/writes done through this transaction object will be a part of the
// transaction. Any other reads/writes will not be tracked by this
// transaction.
//
// For example, reading data via OptimisticTransaction::GetForUpdate() will
// prevent the transaction from committing if this key is written to outside of
// this transaction. Any reads done via DB::Get() will not be checked for
// conflicts at commit time.
//
// It is up to the caller to synchronize access to this object.
//
// See examples/transaction_example.cc for some simple examples.
//
// TODO(agiardullo): Not yet implemented:
// -Transaction support for iterators
// -Ensuring memtable holds large enough history to check for conflicts
// -Support for using Transactions with DBWithTTL
// Options to use when starting an Optimistic Transaction.
// Passed to OptimisticTransactionDB::BeginTransaction().
struct OptimisticTransactionOptions {
  // Setting set_snapshot=true is the same as calling
  // OptimisticTransaction::SetSnapshot(). Defaults to false.
  bool set_snapshot = false;

  // Should be set if the DB has a non-default comparator; defaults to
  // BytewiseComparator().
  // See comment in WriteBatchWithIndex constructor.
  const Comparator* cmp = BytewiseComparator();
};
// Abstract interface for an optimistic transaction. Every operation is pure
// virtual; instances are obtained from
// OptimisticTransactionDB::BeginTransaction() and cannot be copied.
class OptimisticTransaction {
 public:
  virtual ~OptimisticTransaction() {}

  // If SetSnapshot() is not called, all keys read/written through this
  // transaction will only be committed if there have been no writes to
  // these keys outside of this transaction *since the time each key
  // was first read/written* in this transaction.
  //
  // When SetSnapshot() is called, this transaction will create a Snapshot
  // to use for conflict validation of all future operations in the transaction.
  // All future keys read/written will only be committed if there have been
  // no writes to these keys outside of this transaction *since SetSnapshot()
  // was called.*  Otherwise, Commit() will not succeed.
  //
  // It is not necessary to call SetSnapshot() if you only care about other
  // writes happening on keys *after* they have first been read/written in this
  // transaction.  However, you should set a snapshot if you are concerned
  // with any other writes happening since a particular time (such as
  // the start of the transaction).
  //
  // SetSnapshot() may be called multiple times if you would like to change
  // the snapshot used for different operations in this transaction.
  //
  // Calling SetSnapshot() will not affect the version of data returned by
  // Get() methods.  See OptimisticTransaction::Get() for more details.
  //
  // TODO(agiardullo): add better documentation here once memtable changes are
  // committed
  virtual void SetSnapshot() = 0;

  // Returns the Snapshot created by the last call to SetSnapshot().
  //
  // REQUIRED: The returned Snapshot is only valid up until the next time
  // SetSnapshot() is called or the OptimisticTransaction is deleted.
  virtual const Snapshot* GetSnapshot() const = 0;

  // Write all batched keys to the db atomically if there have not been any
  // other writes performed on the keys read/written by this transaction.
  //
  // Currently, Commit() only checks the memtables to verify that there are no
  // other writes to these keys.  If the memtable's history is not long
  // enough to verify that there are no conflicts, Commit() will return
  // a non-OK status.
  //
  // Returns OK on success, non-OK on failure.
  virtual Status Commit() = 0;

  // Discard all batched writes in this transaction.
  virtual void Rollback() = 0;

  // This function is similar to DB::Get() except it will also read pending
  // changes in this transaction.
  //
  // If read_options.snapshot is not set, the current version of the key will
  // be read.  Calling SetSnapshot() does not affect the version of the data
  // returned.
  //
  // Note that setting read_options.snapshot will affect what is read from the
  // DB but will NOT change which keys are read from this transaction (the keys
  // in this transaction do not yet belong to any snapshot and will be fetched
  // regardless).
  virtual Status Get(const ReadOptions& options,
                     ColumnFamilyHandle* column_family, const Slice& key,
                     std::string* value) = 0;

  virtual Status Get(const ReadOptions& options, const Slice& key,
                     std::string* value) = 0;

  virtual std::vector<Status> MultiGet(
      const ReadOptions& options,
      const std::vector<ColumnFamilyHandle*>& column_family,
      const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;

  virtual std::vector<Status> MultiGet(const ReadOptions& options,
                                       const std::vector<Slice>& keys,
                                       std::vector<std::string>* values) = 0;

  // Read this key and ensure that this transaction will only
  // be able to be committed if this key is not written outside this
  // transaction after it has first been read (or after the snapshot if a
  // snapshot is set in this transaction).
  //
  // This function is similar to OptimisticTransaction::Get() except it will
  // affect whether this transaction will be able to be committed.
  virtual Status GetForUpdate(const ReadOptions& options,
                              ColumnFamilyHandle* column_family,
                              const Slice& key, std::string* value) = 0;

  virtual Status GetForUpdate(const ReadOptions& options, const Slice& key,
                              std::string* value) = 0;

  virtual std::vector<Status> MultiGetForUpdate(
      const ReadOptions& options,
      const std::vector<ColumnFamilyHandle*>& column_family,
      const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;

  virtual std::vector<Status> MultiGetForUpdate(
      const ReadOptions& options, const std::vector<Slice>& keys,
      std::vector<std::string>* values) = 0;

  // Put, Merge, and Delete behave similarly to their corresponding
  // functions in WriteBatch.  In addition, this transaction will only
  // be able to be committed if these keys are not written outside of this
  // transaction after they have been written by this transaction (or after the
  // snapshot if a snapshot is set in this transaction).
  virtual void Put(ColumnFamilyHandle* column_family, const Slice& key,
                   const Slice& value) = 0;
  virtual void Put(const Slice& key, const Slice& value) = 0;
  virtual void Put(ColumnFamilyHandle* column_family, const SliceParts& key,
                   const SliceParts& value) = 0;
  virtual void Put(const SliceParts& key, const SliceParts& value) = 0;

  virtual void Merge(ColumnFamilyHandle* column_family, const Slice& key,
                     const Slice& value) = 0;
  virtual void Merge(const Slice& key, const Slice& value) = 0;

  virtual void Delete(ColumnFamilyHandle* column_family, const Slice& key) = 0;
  virtual void Delete(const Slice& key) = 0;
  virtual void Delete(ColumnFamilyHandle* column_family,
                      const SliceParts& key) = 0;
  virtual void Delete(const SliceParts& key) = 0;

  // PutUntracked() will write a Put to the batch of operations to be committed
  // in this transaction.  This write will only happen if this transaction
  // gets committed successfully.  But unlike OptimisticTransaction::Put(),
  // no conflict checking will be done for this key.  So any other writes to
  // this key outside of this transaction will not prevent this transaction from
  // committing.
  //
  // MergeUntracked() and DeleteUntracked() are the untracked counterparts of
  // Merge() and Delete().
  virtual void PutUntracked(ColumnFamilyHandle* column_family, const Slice& key,
                            const Slice& value) = 0;
  virtual void PutUntracked(const Slice& key, const Slice& value) = 0;
  virtual void PutUntracked(ColumnFamilyHandle* column_family,
                            const SliceParts& key, const SliceParts& value) = 0;
  virtual void PutUntracked(const SliceParts& key, const SliceParts& value) = 0;

  virtual void MergeUntracked(ColumnFamilyHandle* column_family,
                              const Slice& key, const Slice& value) = 0;
  virtual void MergeUntracked(const Slice& key, const Slice& value) = 0;

  virtual void DeleteUntracked(ColumnFamilyHandle* column_family,
                               const Slice& key) = 0;

  virtual void DeleteUntracked(const Slice& key) = 0;
  virtual void DeleteUntracked(ColumnFamilyHandle* column_family,
                               const SliceParts& key) = 0;
  virtual void DeleteUntracked(const SliceParts& key) = 0;

  // Similar to WriteBatch::PutLogData
  virtual void PutLogData(const Slice& blob) = 0;

  // Fetch the underlying write batch that contains all pending changes to be
  // committed.
  //
  // Note:  You should not write or delete anything from the batch directly and
  // should only use the functions in the OptimisticTransaction class to
  // write to this transaction.
  virtual WriteBatchWithIndex* GetWriteBatch() = 0;

 protected:
  // To begin a new transaction, see OptimisticTransactionDB::BeginTransaction()
  // The db argument is not used by this base-class constructor.
  explicit OptimisticTransaction(const OptimisticTransactionDB* db) {}
  OptimisticTransaction() {}

 private:
  // No copying allowed
  OptimisticTransaction(const OptimisticTransaction&);
  void operator=(const OptimisticTransaction&);
};
} // namespace rocksdb
#endif // ROCKSDB_LITE

@ -11,16 +11,25 @@
#include "rocksdb/comparator.h" #include "rocksdb/comparator.h"
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/utilities/optimistic_transaction.h"
namespace rocksdb { namespace rocksdb {
class OptimisticTransaction; class Transaction;
// Database with Transaction support. // Database with Transaction support.
// //
// See optimistic_transaction.h and examples/transaction_example.cc // See optimistic_transaction.h and examples/transaction_example.cc
// Options to use when starting an Optimistic Transaction.
// Passed to OptimisticTransactionDB::BeginTransaction().
struct OptimisticTransactionOptions {
  // Setting set_snapshot=true is the same as calling SetSnapshot() on the
  // new transaction. Defaults to false.
  bool set_snapshot = false;

  // Should be set if the DB has a non-default comparator; defaults to
  // BytewiseComparator().
  // See comment in WriteBatchWithIndex constructor.
  const Comparator* cmp = BytewiseComparator();
};
class OptimisticTransactionDB { class OptimisticTransactionDB {
public: public:
// Open an OptimisticTransactionDB similar to DB::Open(). // Open an OptimisticTransactionDB similar to DB::Open().
@ -34,13 +43,12 @@ class OptimisticTransactionDB {
virtual ~OptimisticTransactionDB() {} virtual ~OptimisticTransactionDB() {}
// Starts a new OptimisticTransaction. Passing set_snapshot=true has the same // Starts a new Transaction. Passing set_snapshot=true has the same effect
// effect
// as calling SetSnapshot(). // as calling SetSnapshot().
// //
// Caller should delete the returned transaction after calling // Caller should delete the returned transaction after calling
// Commit() or Rollback(). // Commit() or Rollback().
virtual OptimisticTransaction* BeginTransaction( virtual Transaction* BeginTransaction(
const WriteOptions& write_options, const WriteOptions& write_options,
const OptimisticTransactionOptions& const OptimisticTransactionOptions&
txn_options = OptimisticTransactionOptions()) = 0; txn_options = OptimisticTransactionOptions()) = 0;

@ -0,0 +1,260 @@
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#ifndef ROCKSDB_LITE
#include <string>
#include <vector>
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/status.h"
namespace rocksdb {
class Iterator;
class TransactionDB;
class WriteBatchWithIndex;
// Provides BEGIN/COMMIT/ROLLBACK transactions.
//
// To use transactions, you must first create either an OptimisticTransactionDB
// or a TransactionDB. See examples/[optimistic_]transaction_example.cc for
// more information.
//
// To create a transaction, use [Optimistic]TransactionDB::BeginTransaction().
//
// It is up to the caller to synchronize access to this object.
//
// See examples/transaction_example.cc for some simple examples.
//
// TODO(agiardullo): Not yet implemented
// -PerfContext statistics
// -Support for using Transactions with DBWithTTL
class Transaction {
 public:
  virtual ~Transaction() {}

  // If a transaction has a snapshot set, the transaction will ensure that
  // any keys successfully written (or fetched via GetForUpdate()) have not
  // been modified outside of this transaction since the time the snapshot was
  // set.
  // If a snapshot has not been set, the transaction guarantees that keys have
  // not been modified since the time each key was first written (or fetched via
  // GetForUpdate()).
  //
  // Using SetSnapshot() will provide stricter isolation guarantees at the
  // expense of potentially more transaction failures due to conflicts with
  // other writes.
  //
  // Calling SetSnapshot() has no effect on keys written before this function
  // has been called.
  //
  // SetSnapshot() may be called multiple times if you would like to change
  // the snapshot used for different operations in this transaction.
  //
  // Calling SetSnapshot() will not affect the version of data returned by
  // Get() methods.  See Transaction::Get() for more details.
  virtual void SetSnapshot() = 0;

  // Returns the Snapshot created by the last call to SetSnapshot().
  //
  // REQUIRED: The returned Snapshot is only valid up until the next time
  // SetSnapshot() is called or the Transaction is deleted.
  virtual const Snapshot* GetSnapshot() const = 0;

  // Write all batched keys to the db atomically.
  //
  // Returns OK on success.
  //
  // May return any error status that could be returned by DB::Write().
  //
  // If this transaction was created by an OptimisticTransactionDB,
  // Status::Busy() may be returned if the transaction could not guarantee
  // that there are no write conflicts.
  //
  // If this transaction was created by a TransactionDB, Status::TimedOut()
  // may be returned if this transaction has lived for longer than
  // TransactionOptions.expiration.
  virtual Status Commit() = 0;

  // Discard all batched writes in this transaction.
  virtual void Rollback() = 0;

  // Records the state of the transaction for future calls to
  // RollbackToSavePoint().  May be called multiple times to set multiple save
  // points.
  virtual void SetSavePoint() = 0;

  // Undo all operations in this transaction (Put, Merge, Delete, PutLogData)
  // since the most recent call to SetSavePoint() and remove that most recent
  // save point.
  // If there is no previous call to SetSavePoint(), behaves the same as
  // Rollback().
  virtual void RollbackToSavePoint() = 0;

  // This function is similar to DB::Get() except it will also read pending
  // changes in this transaction.
  //
  // If read_options.snapshot is not set, the current version of the key will
  // be read.  Calling SetSnapshot() does not affect the version of the data
  // returned.
  //
  // Note that setting read_options.snapshot will affect what is read from the
  // DB but will NOT change which keys are read from this transaction (the keys
  // in this transaction do not yet belong to any snapshot and will be fetched
  // regardless).
  virtual Status Get(const ReadOptions& options,
                     ColumnFamilyHandle* column_family, const Slice& key,
                     std::string* value) = 0;

  virtual Status Get(const ReadOptions& options, const Slice& key,
                     std::string* value) = 0;

  virtual std::vector<Status> MultiGet(
      const ReadOptions& options,
      const std::vector<ColumnFamilyHandle*>& column_family,
      const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;

  virtual std::vector<Status> MultiGet(const ReadOptions& options,
                                       const std::vector<Slice>& keys,
                                       std::vector<std::string>* values) = 0;

  // Read this key and ensure that this transaction will only
  // be able to be committed if this key is not written outside this
  // transaction after it has first been read (or after the snapshot if a
  // snapshot is set in this transaction).  The transaction behavior is the
  // same regardless of whether the key exists or not.
  //
  // The values returned by this function are similar to Transaction::Get().
  // If value==nullptr, then this function will not read any data, but will
  // still ensure that this key cannot be written to outside of this
  // transaction.
  //
  // If this transaction was created by a TransactionDB, Status::Busy() may be
  // returned.
  // If this transaction was created by an OptimisticTransactionDB,
  // GetForUpdate() could cause Commit() to later return Status::Busy().
  virtual Status GetForUpdate(const ReadOptions& options,
                              ColumnFamilyHandle* column_family,
                              const Slice& key, std::string* value) = 0;

  virtual Status GetForUpdate(const ReadOptions& options, const Slice& key,
                              std::string* value) = 0;

  virtual std::vector<Status> MultiGetForUpdate(
      const ReadOptions& options,
      const std::vector<ColumnFamilyHandle*>& column_family,
      const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;

  virtual std::vector<Status> MultiGetForUpdate(
      const ReadOptions& options, const std::vector<Slice>& keys,
      std::vector<std::string>* values) = 0;

  // Returns an iterator that will iterate on all keys in the default
  // column family including both keys in the DB and uncommitted keys in this
  // transaction.  The second overload takes the column family to iterate.
  //
  // Setting read_options.snapshot will affect what is read from the
  // DB but will NOT change which keys are read from this transaction (the keys
  // in this transaction do not yet belong to any snapshot and will be fetched
  // regardless).
  //
  // Caller is responsible for deleting the returned Iterator.
  //
  // The returned iterator is only valid until Commit(), Rollback(), or
  // RollbackToSavePoint() is called.
  // NOTE: Transaction::Put/Merge/Delete will currently invalidate this iterator
  // until the following issue is fixed:
  // https://github.com/facebook/rocksdb/issues/616
  virtual Iterator* GetIterator(const ReadOptions& read_options) = 0;

  virtual Iterator* GetIterator(const ReadOptions& read_options,
                                ColumnFamilyHandle* column_family) = 0;

  // Put, Merge, and Delete behave similarly to their corresponding
  // functions in WriteBatch, but will also do conflict checking on the
  // keys being written.
  //
  // If this Transaction was created on an OptimisticTransactionDB, these
  // functions should always return Status::OK().
  // If this Transaction was created on a TransactionDB, the functions can
  // return Status::Busy() if they could not acquire a lock.
  virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
                     const Slice& value) = 0;
  virtual Status Put(const Slice& key, const Slice& value) = 0;
  virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
                     const SliceParts& value) = 0;
  virtual Status Put(const SliceParts& key, const SliceParts& value) = 0;

  virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
                       const Slice& value) = 0;
  virtual Status Merge(const Slice& key, const Slice& value) = 0;

  virtual Status Delete(ColumnFamilyHandle* column_family,
                        const Slice& key) = 0;
  virtual Status Delete(const Slice& key) = 0;
  virtual Status Delete(ColumnFamilyHandle* column_family,
                        const SliceParts& key) = 0;
  virtual Status Delete(const SliceParts& key) = 0;

  // PutUntracked() will write a Put to the batch of operations to be committed
  // in this transaction.  This write will only happen if this transaction
  // gets committed successfully.  But unlike Transaction::Put(),
  // no conflict checking will be done for this key.
  //
  // If this Transaction was created on a TransactionDB, this function will
  // still acquire locks necessary to make sure this write doesn't cause
  // conflicts in other transactions and may return Status::Busy().
  //
  // MergeUntracked() and DeleteUntracked() are the untracked counterparts of
  // Merge() and Delete().
  virtual Status PutUntracked(ColumnFamilyHandle* column_family,
                              const Slice& key, const Slice& value) = 0;
  virtual Status PutUntracked(const Slice& key, const Slice& value) = 0;
  virtual Status PutUntracked(ColumnFamilyHandle* column_family,
                              const SliceParts& key,
                              const SliceParts& value) = 0;
  virtual Status PutUntracked(const SliceParts& key,
                              const SliceParts& value) = 0;

  virtual Status MergeUntracked(ColumnFamilyHandle* column_family,
                                const Slice& key, const Slice& value) = 0;
  virtual Status MergeUntracked(const Slice& key, const Slice& value) = 0;

  virtual Status DeleteUntracked(ColumnFamilyHandle* column_family,
                                 const Slice& key) = 0;

  virtual Status DeleteUntracked(const Slice& key) = 0;
  virtual Status DeleteUntracked(ColumnFamilyHandle* column_family,
                                 const SliceParts& key) = 0;
  virtual Status DeleteUntracked(const SliceParts& key) = 0;

  // Similar to WriteBatch::PutLogData
  virtual void PutLogData(const Slice& blob) = 0;

  // Fetch the underlying write batch that contains all pending changes to be
  // committed.
  //
  // Note:  You should not write or delete anything from the batch directly and
  // should only use the functions in the Transaction class to
  // write to this transaction.
  virtual WriteBatchWithIndex* GetWriteBatch() = 0;

 protected:
  // To begin a new transaction, see
  // [Optimistic]TransactionDB::BeginTransaction().
  // The db argument is not used by this base-class constructor.
  explicit Transaction(const TransactionDB* db) {}
  Transaction() {}

 private:
  // No copying allowed
  Transaction(const Transaction&);
  void operator=(const Transaction&);
};
} // namespace rocksdb
#endif // ROCKSDB_LITE

@ -0,0 +1,130 @@
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#ifndef ROCKSDB_LITE
#include <string>
#include <vector>
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/utilities/stackable_db.h"
#include "rocksdb/utilities/transaction.h"
// Database with Transaction support.
//
// See transaction.h and examples/transaction_example.cc
namespace rocksdb {
// Options that apply to a whole TransactionDB; passed to TransactionDB::Open().
struct TransactionDBOptions {
  // Specifies the maximum number of keys that can be locked at the same time
  // per column family.
  // If the number of locked keys is greater than max_num_locks, transaction
  // writes (or GetForUpdate) will return an error.
  // If this value is not positive, no limit will be enforced.
  int64_t max_num_locks = -1;

  // Increasing this value will increase the concurrency by dividing the lock
  // table (per column family) into more sub-tables, each with their own
  // separate mutex.
  size_t num_stripes = 16;

  // If positive, specifies the default wait timeout in milliseconds when
  // a transaction attempts to lock a key if not specified by
  // TransactionOptions::lock_timeout.
  //
  // If 0, no waiting is done if a lock cannot instantly be acquired.
  // If negative, there is no timeout.  Not using a timeout is not recommended
  // as it can lead to deadlocks.  Currently, there is no deadlock-detection to
  // recover from a deadlock.
  int64_t transaction_lock_timeout = 1000;  // 1 second

  // If positive, specifies the wait timeout in milliseconds when writing a key
  // OUTSIDE of a transaction (i.e. by calling DB::Put(), Merge(), Delete(), or
  // Write() directly).
  // If 0, no waiting is done if a lock cannot instantly be acquired.
  // If negative, there is no timeout and will block indefinitely when acquiring
  // a lock.
  //
  // Not using a timeout can lead to deadlocks.  Currently, there
  // is no deadlock-detection to recover from a deadlock.  While DB writes
  // cannot deadlock with other DB writes, they can deadlock with a transaction.
  // A negative timeout should only be used if all transactions have a small
  // expiration set.
  int64_t default_lock_timeout = 1000;  // 1 second
};
// Per-transaction options; passed to TransactionDB::BeginTransaction().
struct TransactionOptions {
  // Setting set_snapshot=true is the same as calling
  // Transaction::SetSnapshot().
  bool set_snapshot = false;

  // TODO(agiardullo): TransactionDB does not yet support comparators that allow
  // two non-equal keys to be equivalent.  I.e., cmp->Compare(a,b) should only
  // return 0 if a.compare(b) returns 0.

  // If positive, specifies the wait timeout in milliseconds when
  // a transaction attempts to lock a key.
  //
  // If 0, no waiting is done if a lock cannot instantly be acquired.
  // If negative, TransactionDBOptions::transaction_lock_timeout will be used.
  int64_t lock_timeout = -1;

  // Expiration duration in milliseconds.  If non-negative, transactions that
  // last longer than this many milliseconds will fail to commit.  If not set,
  // a forgotten transaction that is never committed, rolled back, or deleted
  // will never relinquish any locks it holds.  This could prevent keys from
  // being written by other writers.
  //
  // TODO(agiardullo):  Improve performance of checking expiration time.
  int64_t expiration = -1;
};
// Database with Transaction support, layered on a DB through StackableDB.
// Instances are created with Open() and cannot be copied.
class TransactionDB : public StackableDB {
 public:
  // Open a TransactionDB similar to DB::Open().
  // On return, *dbptr is the output parameter for the opened database.
  static Status Open(const Options& options,
                     const TransactionDBOptions& txn_db_options,
                     const std::string& dbname, TransactionDB** dbptr);

  // Overload of Open() that takes explicit column family descriptors and
  // an output vector for their handles.
  static Status Open(const DBOptions& db_options,
                     const TransactionDBOptions& txn_db_options,
                     const std::string& dbname,
                     const std::vector<ColumnFamilyDescriptor>& column_families,
                     std::vector<ColumnFamilyHandle*>* handles,
                     TransactionDB** dbptr);

  virtual ~TransactionDB() {}

  // Starts a new Transaction.  Passing set_snapshot=true has the same effect
  // as calling Transaction::SetSnapshot().
  //
  // Caller should delete the returned transaction after calling
  // Transaction::Commit() or Transaction::Rollback().
  virtual Transaction* BeginTransaction(
      const WriteOptions& write_options,
      const TransactionOptions& txn_options = TransactionOptions()) = 0;

 protected:
  // To create a TransactionDB, call Open().
  // The db pointer is forwarded to the StackableDB base.
  explicit TransactionDB(DB* db) : StackableDB(db) {}

 private:
  // No copying allowed
  TransactionDB(const TransactionDB&);
  void operator=(const TransactionDB&);
};
} // namespace rocksdb
#endif // ROCKSDB_LITE

@ -118,6 +118,10 @@ LIB_SOURCES = \
utilities/table_properties_collectors/compact_on_deletion_collector.cc \ utilities/table_properties_collectors/compact_on_deletion_collector.cc \
utilities/transactions/optimistic_transaction_impl.cc \ utilities/transactions/optimistic_transaction_impl.cc \
utilities/transactions/optimistic_transaction_db_impl.cc \ utilities/transactions/optimistic_transaction_db_impl.cc \
utilities/transactions/transaction_db_impl.cc \
utilities/transactions/transaction_lock_mgr.cc \
utilities/transactions/transaction_impl.cc \
utilities/transactions/transaction_util.cc \
utilities/ttl/db_ttl_impl.cc \ utilities/ttl/db_ttl_impl.cc \
utilities/write_batch_with_index/write_batch_with_index.cc \ utilities/write_batch_with_index/write_batch_with_index.cc \
utilities/write_batch_with_index/write_batch_with_index_internal.cc \ utilities/write_batch_with_index/write_batch_with_index_internal.cc \
@ -235,6 +239,7 @@ TEST_BENCH_SOURCES = \
utilities/spatialdb/spatial_db_test.cc \ utilities/spatialdb/spatial_db_test.cc \
utilities/table_properties_collectors/compact_on_deletion_collector_test.cc \ utilities/table_properties_collectors/compact_on_deletion_collector_test.cc \
utilities/transactions/optimistic_transaction_test.cc \ utilities/transactions/optimistic_transaction_test.cc \
utilities/transactions/transaction_test.cc \
utilities/ttl/ttl_test.cc \ utilities/ttl/ttl_test.cc \
utilities/write_batch_with_index/write_batch_with_index_test.cc \ utilities/write_batch_with_index/write_batch_with_index_test.cc \
util/log_write_bench.cc \ util/log_write_bench.cc \

@ -67,6 +67,9 @@ std::string Status::ToString() const {
case kShutdownInProgress: case kShutdownInProgress:
type = "Shutdown in progress: "; type = "Shutdown in progress: ";
break; break;
case kTimedOut:
type = "Operation timed out: ";
break;
case kAborted: case kAborted:
type = "Operation aborted: "; type = "Operation aborted: ";
break; break;

@ -18,10 +18,10 @@
namespace rocksdb { namespace rocksdb {
OptimisticTransaction* OptimisticTransactionDBImpl::BeginTransaction( Transaction* OptimisticTransactionDBImpl::BeginTransaction(
const WriteOptions& write_options, const WriteOptions& write_options,
const OptimisticTransactionOptions& txn_options) { const OptimisticTransactionOptions& txn_options) {
OptimisticTransaction* txn = Transaction* txn =
new OptimisticTransactionImpl(this, write_options, txn_options); new OptimisticTransactionImpl(this, write_options, txn_options);
return txn; return txn;

@ -19,7 +19,7 @@ class OptimisticTransactionDBImpl : public OptimisticTransactionDB {
~OptimisticTransactionDBImpl() {} ~OptimisticTransactionDBImpl() {}
OptimisticTransaction* BeginTransaction( Transaction* BeginTransaction(
const WriteOptions& write_options, const WriteOptions& write_options,
const OptimisticTransactionOptions& txn_options) override; const OptimisticTransactionOptions& txn_options) override;

@ -7,11 +7,7 @@
#include "utilities/transactions/optimistic_transaction_impl.h" #include "utilities/transactions/optimistic_transaction_impl.h"
#ifndef __STDC_FORMAT_MACROS #include <algorithm>
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include <string> #include <string>
#include <vector> #include <vector>
@ -22,6 +18,7 @@
#include "rocksdb/status.h" #include "rocksdb/status.h"
#include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/optimistic_transaction_db.h"
#include "util/string_util.h" #include "util/string_util.h"
#include "utilities/transactions/transaction_util.h"
namespace rocksdb { namespace rocksdb {
@ -34,7 +31,8 @@ OptimisticTransactionImpl::OptimisticTransactionImpl(
db_(txn_db->GetBaseDB()), db_(txn_db->GetBaseDB()),
write_options_(write_options), write_options_(write_options),
snapshot_(nullptr), snapshot_(nullptr),
write_batch_(txn_options.cmp, 0, true) { cmp_(txn_options.cmp),
write_batch_(new WriteBatchWithIndex(txn_options.cmp, 0, true)) {
if (txn_options.set_snapshot) { if (txn_options.set_snapshot) {
SetSnapshot(); SetSnapshot();
} else { } else {
@ -72,11 +70,12 @@ Status OptimisticTransactionImpl::Commit() {
} }
Status s = db_impl->WriteWithCallback( Status s = db_impl->WriteWithCallback(
write_options_, write_batch_.GetWriteBatch(), &callback); write_options_, write_batch_->GetWriteBatch(), &callback);
if (s.ok()) { if (s.ok()) {
tracked_keys_.clear(); tracked_keys_.clear();
write_batch_.Clear(); write_batch_->Clear();
num_entries_ = 0;
} }
return s; return s;
@ -84,7 +83,57 @@ Status OptimisticTransactionImpl::Commit() {
void OptimisticTransactionImpl::Rollback() { void OptimisticTransactionImpl::Rollback() {
tracked_keys_.clear(); tracked_keys_.clear();
write_batch_.Clear(); write_batch_->Clear();
num_entries_ = 0;
}
// Records the current number of batched entries as a save point so that a
// later RollbackToSavePoint() can restore the transaction to this state.
void OptimisticTransactionImpl::SetSavePoint() {
  // An empty transaction needs no record: RollbackToSavePoint() treats a
  // missing save point as "zero entries".
  if (num_entries_ <= 0) {
    return;
  }

  // The stack is allocated lazily, on the first save point that is recorded.
  if (!save_points_) {
    save_points_.reset(new std::stack<size_t>());
  }

  save_points_->push(num_entries_);
}
// Undoes the operations batched since the most recent save point and discards
// that save point. When no save point is recorded, the whole transaction is
// rolled back via Rollback().
//
// If the batch cannot be rebuilt, the error is printed to stderr and the
// transaction keeps its current batch and entry count (the save point has
// still been consumed).
void OptimisticTransactionImpl::RollbackToSavePoint() {
  // No recorded save point is equivalent to a save point at zero entries.
  size_t savepoint_entries = 0;
  if (save_points_ != nullptr && !save_points_->empty()) {
    savepoint_entries = save_points_->top();
    save_points_->pop();
  }

  assert(savepoint_entries <= num_entries_);

  if (savepoint_entries == num_entries_) {
    // No changes to rollback
    return;
  }

  if (savepoint_entries == 0) {
    // Rollback everything
    Rollback();
    return;
  }

  DBImpl* db_impl = dynamic_cast<DBImpl*>(db_->GetRootDB());
  assert(db_impl);

  // Build a replacement batch; CopyFirstN() is expected to fill it with the
  // first savepoint_entries entries of the current batch.
  WriteBatchWithIndex* new_batch = new WriteBatchWithIndex(cmp_, 0, true);
  Status s = TransactionUtil::CopyFirstN(savepoint_entries, write_batch_.get(),
                                         new_batch, db_impl);
  if (!s.ok()) {
    // TODO:  Should we change this function to return a Status or should we
    // somehow make it so RollbackToSavePoint() can never fail??
    // Consider moving this functionality into WriteBatchWithIndex
    fprintf(stderr, "STATUS: %s \n", s.ToString().c_str());
    delete new_batch;
    // write_batch_ still holds every entry, so num_entries_ must not be
    // lowered here; otherwise the counter and the batch would disagree and
    // later save points would record the wrong position.
    return;
  }

  // Only adopt the new entry count together with the new batch.
  write_batch_.reset(new_batch);
  num_entries_ = savepoint_entries;
}
// Record this key so that we can check it for conflicts at commit time. // Record this key so that we can check it for conflicts at commit time.
@ -135,8 +184,8 @@ void OptimisticTransactionImpl::RecordOperation(
// Reads `key` through the transaction's write batch and the DB via
// WriteBatchWithIndex::GetFromBatchAndDB().  The key is not tracked for
// conflict detection by this call.
Status OptimisticTransactionImpl::Get(const ReadOptions& read_options,
                                      ColumnFamilyHandle* column_family,
                                      const Slice& key, std::string* value) {
  Status s = write_batch_->GetFromBatchAndDB(db_, read_options, column_family,
                                             key, value);
  return s;
}
Status OptimisticTransactionImpl::GetForUpdate( Status OptimisticTransactionImpl::GetForUpdate(
@ -145,7 +194,11 @@ Status OptimisticTransactionImpl::GetForUpdate(
// Regardless of whether the Get succeeded, track this key. // Regardless of whether the Get succeeded, track this key.
RecordOperation(column_family, key); RecordOperation(column_family, key);
return Get(read_options, column_family, key, value); if (value == nullptr) {
return Status::OK();
} else {
return Get(read_options, column_family, key, value);
}
} }
std::vector<Status> OptimisticTransactionImpl::MultiGet( std::vector<Status> OptimisticTransactionImpl::MultiGet(
@ -159,7 +212,7 @@ std::vector<Status> OptimisticTransactionImpl::MultiGet(
// TODO(agiardullo): optimize multiget? // TODO(agiardullo): optimize multiget?
std::vector<Status> stat_list(num_keys); std::vector<Status> stat_list(num_keys);
for (size_t i = 0; i < num_keys; ++i) { for (size_t i = 0; i < num_keys; ++i) {
std::string* value = &(*values)[i]; std::string* value = values ? &(*values)[i] : nullptr;
stat_list[i] = Get(read_options, column_family[i], keys[i], value); stat_list[i] = Get(read_options, column_family[i], keys[i], value);
} }
@ -180,169 +233,141 @@ std::vector<Status> OptimisticTransactionImpl::MultiGetForUpdate(
// Regardless of whether the Get succeeded, track this key. // Regardless of whether the Get succeeded, track this key.
RecordOperation(column_family[i], keys[i]); RecordOperation(column_family[i], keys[i]);
std::string* value = &(*values)[i]; std::string* value = values ? &(*values)[i] : nullptr;
stat_list[i] = Get(read_options, column_family[i], keys[i], value); stat_list[i] = Get(read_options, column_family[i], keys[i], value);
} }
return stat_list; return stat_list;
} }
void OptimisticTransactionImpl::Put(ColumnFamilyHandle* column_family, Iterator* OptimisticTransactionImpl::GetIterator(
const Slice& key, const Slice& value) { const ReadOptions& read_options) {
Iterator* db_iter = db_->NewIterator(read_options);
assert(db_iter);
return write_batch_->NewIteratorWithBase(db_iter);
}
// Column-family variant: creates a DB iterator for `column_family` and wraps
// it together with this transaction's write batch via NewIteratorWithBase().
Iterator* OptimisticTransactionImpl::GetIterator(
    const ReadOptions& read_options, ColumnFamilyHandle* column_family) {
  Iterator* const base_iter = db_->NewIterator(read_options, column_family);
  assert(base_iter);

  Iterator* const txn_iter =
      write_batch_->NewIteratorWithBase(column_family, base_iter);
  return txn_iter;
}
// Tracked Put: records `key` for the commit-time conflict check, then buffers
// the write in the transaction's batch.  Always returns OK.
Status OptimisticTransactionImpl::Put(ColumnFamilyHandle* column_family,
                                      const Slice& key, const Slice& value) {
  RecordOperation(column_family, key);

  write_batch_->Put(column_family, key, value);
  ++num_entries_;  // keep the save-point bookkeeping in step with the batch

  return Status::OK();
}
void OptimisticTransactionImpl::Put(ColumnFamilyHandle* column_family, Status OptimisticTransactionImpl::Put(ColumnFamilyHandle* column_family,
const SliceParts& key, const SliceParts& key,
const SliceParts& value) { const SliceParts& value) {
RecordOperation(column_family, key); RecordOperation(column_family, key);
write_batch_.Put(column_family, key, value); write_batch_->Put(column_family, key, value);
num_entries_++;
return Status::OK();
} }
void OptimisticTransactionImpl::Merge(ColumnFamilyHandle* column_family, Status OptimisticTransactionImpl::Merge(ColumnFamilyHandle* column_family,
const Slice& key, const Slice& value) { const Slice& key, const Slice& value) {
RecordOperation(column_family, key); RecordOperation(column_family, key);
write_batch_.Merge(column_family, key, value); write_batch_->Merge(column_family, key, value);
return Status::OK();
} }
void OptimisticTransactionImpl::Delete(ColumnFamilyHandle* column_family, Status OptimisticTransactionImpl::Delete(ColumnFamilyHandle* column_family,
const Slice& key) { const Slice& key) {
RecordOperation(column_family, key); RecordOperation(column_family, key);
write_batch_.Delete(column_family, key); write_batch_->Delete(column_family, key);
return Status::OK();
} }
void OptimisticTransactionImpl::Delete(ColumnFamilyHandle* column_family, Status OptimisticTransactionImpl::Delete(ColumnFamilyHandle* column_family,
const SliceParts& key) { const SliceParts& key) {
RecordOperation(column_family, key); RecordOperation(column_family, key);
write_batch_.Delete(column_family, key); write_batch_->Delete(column_family, key);
return Status::OK();
} }
void OptimisticTransactionImpl::PutUntracked(ColumnFamilyHandle* column_family, Status OptimisticTransactionImpl::PutUntracked(
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) {
const Slice& value) { write_batch_->Put(column_family, key, value);
write_batch_.Put(column_family, key, value); num_entries_++;
return Status::OK();
} }
void OptimisticTransactionImpl::PutUntracked(ColumnFamilyHandle* column_family, Status OptimisticTransactionImpl::PutUntracked(
const SliceParts& key, ColumnFamilyHandle* column_family, const SliceParts& key,
const SliceParts& value) { const SliceParts& value) {
write_batch_.Put(column_family, key, value); write_batch_->Put(column_family, key, value);
num_entries_++;
return Status::OK();
} }
void OptimisticTransactionImpl::MergeUntracked( Status OptimisticTransactionImpl::MergeUntracked(
ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) {
write_batch_.Merge(column_family, key, value); write_batch_->Merge(column_family, key, value);
num_entries_++;
return Status::OK();
} }
void OptimisticTransactionImpl::DeleteUntracked( Status OptimisticTransactionImpl::DeleteUntracked(
ColumnFamilyHandle* column_family, const Slice& key) { ColumnFamilyHandle* column_family, const Slice& key) {
write_batch_.Delete(column_family, key); write_batch_->Delete(column_family, key);
num_entries_++;
return Status::OK();
} }
void OptimisticTransactionImpl::DeleteUntracked( Status OptimisticTransactionImpl::DeleteUntracked(
ColumnFamilyHandle* column_family, const SliceParts& key) { ColumnFamilyHandle* column_family, const SliceParts& key) {
write_batch_.Delete(column_family, key); write_batch_->Delete(column_family, key);
num_entries_++;
return Status::OK();
} }
void OptimisticTransactionImpl::PutLogData(const Slice& blob) { void OptimisticTransactionImpl::PutLogData(const Slice& blob) {
write_batch_.PutLogData(blob); write_batch_->PutLogData(blob);
num_entries_++;
} }
// Exposes the transaction's underlying batch.  The returned pointer is
// non-owning: the batch stays owned by write_batch_.
WriteBatchWithIndex* OptimisticTransactionImpl::GetWriteBatch() {
  WriteBatchWithIndex* const batch = write_batch_.get();
  return batch;
}
// Returns OK if it is safe to commit this transaction.  Returns Status::Busy
// if there are read or write conflicts that would prevent us from committing OR
// if we can not determine whether there would be any such conflicts.
//
// Should only be called on writer thread in order to avoid any race conditions
// in detecting write conflicts.
Status OptimisticTransactionImpl::CheckTransactionForConflicts(DB* db) {
  // Debug builds verify that `db` really is a DBImpl.
  assert(dynamic_cast<DBImpl*>(db) != nullptr);
  // static_cast is the well-defined cast for a base-to-derived conversion;
  // reinterpret_cast performs no pointer adjustment and is not meant for it.
  auto db_impl = static_cast<DBImpl*>(db);

  // The per-key conflict detection is delegated to TransactionUtil.
  return TransactionUtil::CheckKeysForConflicts(db_impl, &tracked_keys_);
}
} // namespace rocksdb } // namespace rocksdb

@ -7,6 +7,7 @@
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
#include <stack>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
@ -16,17 +17,14 @@
#include "rocksdb/slice.h" #include "rocksdb/slice.h"
#include "rocksdb/status.h" #include "rocksdb/status.h"
#include "rocksdb/types.h" #include "rocksdb/types.h"
#include "rocksdb/utilities/optimistic_transaction.h" #include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/optimistic_transaction_db.h"
#include "rocksdb/utilities/write_batch_with_index.h" #include "rocksdb/utilities/write_batch_with_index.h"
#include "utilities/transactions/transaction_util.h"
namespace rocksdb { namespace rocksdb {
using TransactionKeyMap = class OptimisticTransactionImpl : public Transaction {
std::unordered_map<uint32_t,
std::unordered_map<std::string, SequenceNumber>>;
class OptimisticTransactionImpl : public OptimisticTransaction {
public: public:
OptimisticTransactionImpl(OptimisticTransactionDB* db, OptimisticTransactionImpl(OptimisticTransactionDB* db,
const WriteOptions& write_options, const WriteOptions& write_options,
@ -38,6 +36,10 @@ class OptimisticTransactionImpl : public OptimisticTransaction {
void Rollback() override; void Rollback() override;
void SetSavePoint() override;
void RollbackToSavePoint() override;
Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, std::string* value) override; const Slice& key, std::string* value) override;
@ -84,57 +86,61 @@ class OptimisticTransactionImpl : public OptimisticTransaction {
keys, values); keys, values);
} }
void Put(ColumnFamilyHandle* column_family, const Slice& key, Iterator* GetIterator(const ReadOptions& read_options) override;
const Slice& value) override; Iterator* GetIterator(const ReadOptions& read_options,
void Put(const Slice& key, const Slice& value) override { ColumnFamilyHandle* column_family) override;
Put(nullptr, key, value);
Status Put(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) override;
Status Put(const Slice& key, const Slice& value) override {
return Put(nullptr, key, value);
} }
void Put(ColumnFamilyHandle* column_family, const SliceParts& key, Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
const SliceParts& value) override; const SliceParts& value) override;
void Put(const SliceParts& key, const SliceParts& value) override { Status Put(const SliceParts& key, const SliceParts& value) override {
Put(nullptr, key, value); return Put(nullptr, key, value);
} }
void Merge(ColumnFamilyHandle* column_family, const Slice& key, Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) override; const Slice& value) override;
void Merge(const Slice& key, const Slice& value) override { Status Merge(const Slice& key, const Slice& value) override {
Merge(nullptr, key, value); return Merge(nullptr, key, value);
} }
void Delete(ColumnFamilyHandle* column_family, const Slice& key) override; Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override;
void Delete(const Slice& key) override { Delete(nullptr, key); } Status Delete(const Slice& key) override { return Delete(nullptr, key); }
void Delete(ColumnFamilyHandle* column_family, Status Delete(ColumnFamilyHandle* column_family,
const SliceParts& key) override; const SliceParts& key) override;
void Delete(const SliceParts& key) override { Delete(nullptr, key); } Status Delete(const SliceParts& key) override { return Delete(nullptr, key); }
void PutUntracked(ColumnFamilyHandle* column_family, const Slice& key, Status PutUntracked(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) override; const Slice& value) override;
void PutUntracked(const Slice& key, const Slice& value) override { Status PutUntracked(const Slice& key, const Slice& value) override {
PutUntracked(nullptr, key, value); return PutUntracked(nullptr, key, value);
} }
void PutUntracked(ColumnFamilyHandle* column_family, const SliceParts& key, Status PutUntracked(ColumnFamilyHandle* column_family, const SliceParts& key,
const SliceParts& value) override; const SliceParts& value) override;
void PutUntracked(const SliceParts& key, const SliceParts& value) override { Status PutUntracked(const SliceParts& key, const SliceParts& value) override {
PutUntracked(nullptr, key, value); return PutUntracked(nullptr, key, value);
} }
void MergeUntracked(ColumnFamilyHandle* column_family, const Slice& key, Status MergeUntracked(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) override; const Slice& value) override;
void MergeUntracked(const Slice& key, const Slice& value) override { Status MergeUntracked(const Slice& key, const Slice& value) override {
MergeUntracked(nullptr, key, value); return MergeUntracked(nullptr, key, value);
} }
void DeleteUntracked(ColumnFamilyHandle* column_family, Status DeleteUntracked(ColumnFamilyHandle* column_family,
const Slice& key) override; const Slice& key) override;
void DeleteUntracked(const Slice& key) override { Status DeleteUntracked(const Slice& key) override {
DeleteUntracked(nullptr, key); return DeleteUntracked(nullptr, key);
} }
void DeleteUntracked(ColumnFamilyHandle* column_family, Status DeleteUntracked(ColumnFamilyHandle* column_family,
const SliceParts& key) override; const SliceParts& key) override;
void DeleteUntracked(const SliceParts& key) override { Status DeleteUntracked(const SliceParts& key) override {
DeleteUntracked(nullptr, key); return DeleteUntracked(nullptr, key);
} }
void PutLogData(const Slice& blob) override; void PutLogData(const Slice& blob) override;
@ -153,12 +159,24 @@ class OptimisticTransactionImpl : public OptimisticTransaction {
const WriteOptions write_options_; const WriteOptions write_options_;
const Snapshot* snapshot_; const Snapshot* snapshot_;
SequenceNumber start_sequence_number_; SequenceNumber start_sequence_number_;
WriteBatchWithIndex write_batch_; const Comparator* cmp_;
std::unique_ptr<WriteBatchWithIndex> write_batch_;
private: private:
// Map of Column Family IDs to keys and their sequence numbers // Map of Column Family IDs to keys and corresponding sequence numbers.
// The sequence number stored for a key will be used during commit to make
// sure this key has not changed since this sequence number.
TransactionKeyMap tracked_keys_; TransactionKeyMap tracked_keys_;
// Records the number of entries currently in the WriteBatch, including calls
// to Put, Merge, Delete, and PutLogData().
size_t num_entries_ = 0;
// Stack of number of entries in write_batch at each save point
std::unique_ptr<std::stack<size_t>> save_points_;
friend class OptimisticTransactionCallback; friend class OptimisticTransactionCallback;
// Returns OK if it is safe to commit this transaction. Returns Status::Busy // Returns OK if it is safe to commit this transaction. Returns Status::Busy

@ -8,7 +8,7 @@
#include <string> #include <string>
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/utilities/optimistic_transaction.h" #include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/optimistic_transaction_db.h"
#include "util/logging.h" #include "util/logging.h"
#include "util/testharness.h" #include "util/testharness.h"
@ -34,7 +34,6 @@ class OptimisticTransactionTest : public testing::Test {
assert(s.ok()); assert(s.ok());
db = txn_db->GetBaseDB(); db = txn_db->GetBaseDB();
} }
~OptimisticTransactionTest() { ~OptimisticTransactionTest() {
delete txn_db; delete txn_db;
DestroyDB(dbname, options); DestroyDB(dbname, options);
@ -50,7 +49,7 @@ TEST_F(OptimisticTransactionTest, SuccessTest) {
db->Put(write_options, Slice("foo"), Slice("bar")); db->Put(write_options, Slice("foo"), Slice("bar"));
db->Put(write_options, Slice("foo2"), Slice("bar")); db->Put(write_options, Slice("foo2"), Slice("bar"));
OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); Transaction* txn = txn_db->BeginTransaction(write_options);
ASSERT_TRUE(txn); ASSERT_TRUE(txn);
txn->GetForUpdate(read_options, "foo", &value); txn->GetForUpdate(read_options, "foo", &value);
@ -79,7 +78,7 @@ TEST_F(OptimisticTransactionTest, WriteConflictTest) {
db->Put(write_options, "foo", "bar"); db->Put(write_options, "foo", "bar");
db->Put(write_options, "foo2", "bar"); db->Put(write_options, "foo2", "bar");
OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); Transaction* txn = txn_db->BeginTransaction(write_options);
ASSERT_TRUE(txn); ASSERT_TRUE(txn);
txn->Put("foo", "bar2"); txn->Put("foo", "bar2");
@ -114,8 +113,7 @@ TEST_F(OptimisticTransactionTest, WriteConflictTest2) {
db->Put(write_options, "foo2", "bar"); db->Put(write_options, "foo2", "bar");
txn_options.set_snapshot = true; txn_options.set_snapshot = true;
OptimisticTransaction* txn = Transaction* txn = txn_db->BeginTransaction(write_options, txn_options);
txn_db->BeginTransaction(write_options, txn_options);
ASSERT_TRUE(txn); ASSERT_TRUE(txn);
// This Put outside of a transaction will conflict with a later write // This Put outside of a transaction will conflict with a later write
@ -150,8 +148,7 @@ TEST_F(OptimisticTransactionTest, ReadConflictTest) {
db->Put(write_options, "foo2", "bar"); db->Put(write_options, "foo2", "bar");
txn_options.set_snapshot = true; txn_options.set_snapshot = true;
OptimisticTransaction* txn = Transaction* txn = txn_db->BeginTransaction(write_options, txn_options);
txn_db->BeginTransaction(write_options, txn_options);
ASSERT_TRUE(txn); ASSERT_TRUE(txn);
txn->SetSnapshot(); txn->SetSnapshot();
@ -188,7 +185,7 @@ TEST_F(OptimisticTransactionTest, TxnOnlyTest) {
string value; string value;
Status s; Status s;
OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); Transaction* txn = txn_db->BeginTransaction(write_options);
ASSERT_TRUE(txn); ASSERT_TRUE(txn);
txn->Put("x", "y"); txn->Put("x", "y");
@ -208,7 +205,7 @@ TEST_F(OptimisticTransactionTest, FlushTest) {
db->Put(write_options, Slice("foo"), Slice("bar")); db->Put(write_options, Slice("foo"), Slice("bar"));
db->Put(write_options, Slice("foo2"), Slice("bar")); db->Put(write_options, Slice("foo2"), Slice("bar"));
OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); Transaction* txn = txn_db->BeginTransaction(write_options);
ASSERT_TRUE(txn); ASSERT_TRUE(txn);
snapshot_read_options.snapshot = txn->GetSnapshot(); snapshot_read_options.snapshot = txn->GetSnapshot();
@ -248,7 +245,7 @@ TEST_F(OptimisticTransactionTest, FlushTest2) {
db->Put(write_options, Slice("foo"), Slice("bar")); db->Put(write_options, Slice("foo"), Slice("bar"));
db->Put(write_options, Slice("foo2"), Slice("bar")); db->Put(write_options, Slice("foo2"), Slice("bar"));
OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); Transaction* txn = txn_db->BeginTransaction(write_options);
ASSERT_TRUE(txn); ASSERT_TRUE(txn);
snapshot_read_options.snapshot = txn->GetSnapshot(); snapshot_read_options.snapshot = txn->GetSnapshot();
@ -302,7 +299,7 @@ TEST_F(OptimisticTransactionTest, NoSnapshotTest) {
db->Put(write_options, "AAA", "bar"); db->Put(write_options, "AAA", "bar");
OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); Transaction* txn = txn_db->BeginTransaction(write_options);
ASSERT_TRUE(txn); ASSERT_TRUE(txn);
// Modify key after transaction start // Modify key after transaction start
@ -333,7 +330,7 @@ TEST_F(OptimisticTransactionTest, MultipleSnapshotTest) {
db->Put(write_options, "BBB", "bar"); db->Put(write_options, "BBB", "bar");
db->Put(write_options, "CCC", "bar"); db->Put(write_options, "CCC", "bar");
OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); Transaction* txn = txn_db->BeginTransaction(write_options);
ASSERT_TRUE(txn); ASSERT_TRUE(txn);
db->Put(write_options, "AAA", "bar1"); db->Put(write_options, "AAA", "bar1");
@ -410,8 +407,7 @@ TEST_F(OptimisticTransactionTest, MultipleSnapshotTest) {
OptimisticTransactionOptions txn_options; OptimisticTransactionOptions txn_options;
txn_options.set_snapshot = true; txn_options.set_snapshot = true;
OptimisticTransaction* txn2 = Transaction* txn2 = txn_db->BeginTransaction(write_options, txn_options);
txn_db->BeginTransaction(write_options, txn_options);
txn2->SetSnapshot(); txn2->SetSnapshot();
// This should not conflict in txn since the snapshot is later than the // This should not conflict in txn since the snapshot is later than the
@ -467,15 +463,14 @@ TEST_F(OptimisticTransactionTest, ColumnFamiliesTest) {
ASSERT_OK(s); ASSERT_OK(s);
db = txn_db->GetBaseDB(); db = txn_db->GetBaseDB();
OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); Transaction* txn = txn_db->BeginTransaction(write_options);
ASSERT_TRUE(txn); ASSERT_TRUE(txn);
txn->SetSnapshot(); txn->SetSnapshot();
snapshot_read_options.snapshot = txn->GetSnapshot(); snapshot_read_options.snapshot = txn->GetSnapshot();
txn_options.set_snapshot = true; txn_options.set_snapshot = true;
OptimisticTransaction* txn2 = Transaction* txn2 = txn_db->BeginTransaction(write_options, txn_options);
txn_db->BeginTransaction(write_options, txn_options);
ASSERT_TRUE(txn2); ASSERT_TRUE(txn2);
// Write some data to the db // Write some data to the db
@ -594,7 +589,7 @@ TEST_F(OptimisticTransactionTest, EmptyTest) {
s = db->Put(write_options, "aaa", "aaa"); s = db->Put(write_options, "aaa", "aaa");
ASSERT_OK(s); ASSERT_OK(s);
OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); Transaction* txn = txn_db->BeginTransaction(write_options);
s = txn->Commit(); s = txn->Commit();
ASSERT_OK(s); ASSERT_OK(s);
delete txn; delete txn;
@ -630,11 +625,10 @@ TEST_F(OptimisticTransactionTest, PredicateManyPreceders) {
Status s; Status s;
txn_options.set_snapshot = true; txn_options.set_snapshot = true;
OptimisticTransaction* txn1 = Transaction* txn1 = txn_db->BeginTransaction(write_options, txn_options);
txn_db->BeginTransaction(write_options, txn_options);
read_options1.snapshot = txn1->GetSnapshot(); read_options1.snapshot = txn1->GetSnapshot();
OptimisticTransaction* txn2 = txn_db->BeginTransaction(write_options); Transaction* txn2 = txn_db->BeginTransaction(write_options);
txn2->SetSnapshot(); txn2->SetSnapshot();
read_options2.snapshot = txn2->GetSnapshot(); read_options2.snapshot = txn2->GetSnapshot();
@ -697,8 +691,8 @@ TEST_F(OptimisticTransactionTest, LostUpdate) {
// Test 2 transactions writing to the same key in multiple orders and // Test 2 transactions writing to the same key in multiple orders and
// with/without snapshots // with/without snapshots
OptimisticTransaction* txn1 = txn_db->BeginTransaction(write_options); Transaction* txn1 = txn_db->BeginTransaction(write_options);
OptimisticTransaction* txn2 = txn_db->BeginTransaction(write_options); Transaction* txn2 = txn_db->BeginTransaction(write_options);
txn1->Put("1", "1"); txn1->Put("1", "1");
txn2->Put("1", "2"); txn2->Put("1", "2");
@ -792,7 +786,7 @@ TEST_F(OptimisticTransactionTest, UntrackedWrites) {
Status s; Status s;
// Verify transaction rollback works for untracked keys. // Verify transaction rollback works for untracked keys.
OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); Transaction* txn = txn_db->BeginTransaction(write_options);
txn->PutUntracked("untracked", "0"); txn->PutUntracked("untracked", "0");
txn->Rollback(); txn->Rollback();
s = db->Get(read_options, "untracked", &value); s = db->Get(read_options, "untracked", &value);
@ -836,6 +830,280 @@ TEST_F(OptimisticTransactionTest, UntrackedWrites) {
delete txn; delete txn;
} }
// Verifies that a transaction iterator merges the DB contents (as of the
// transaction's snapshot) with the transaction's own pending writes, and that
// a key modified in the DB after the snapshot makes Commit() return Busy.
TEST_F(OptimisticTransactionTest, IteratorTest) {
  WriteOptions write_options;
  ReadOptions read_options;
  Status s;

  // Write some keys to the db
  s = db->Put(write_options, "A", "a");
  ASSERT_OK(s);

  s = db->Put(write_options, "G", "g");
  ASSERT_OK(s);

  s = db->Put(write_options, "F", "f");
  ASSERT_OK(s);

  s = db->Put(write_options, "C", "c");
  ASSERT_OK(s);

  s = db->Put(write_options, "D", "d");
  ASSERT_OK(s);

  Transaction* txn = txn_db->BeginTransaction(write_options);
  ASSERT_TRUE(txn);

  // Write some keys in a txn
  s = txn->Put("B", "b");
  ASSERT_OK(s);

  s = txn->Put("H", "h");
  ASSERT_OK(s);

  s = txn->Delete("D");
  ASSERT_OK(s);

  s = txn->Put("E", "e");
  ASSERT_OK(s);

  txn->SetSnapshot();
  const Snapshot* snapshot = txn->GetSnapshot();

  // Write some keys to the db after the snapshot
  s = db->Put(write_options, "BB", "xx");
  ASSERT_OK(s);

  s = db->Put(write_options, "C", "xx");
  ASSERT_OK(s);

  read_options.snapshot = snapshot;
  Iterator* iter = txn->GetIterator(read_options);
  ASSERT_OK(iter->status());
  iter->SeekToFirst();

  // Read all keys via iter and track each one with GetForUpdate.  "D" is
  // deleted in the txn and "BB" was written after the snapshot, so neither
  // is expected.
  const std::string results[] = {"a", "b", "c", "e", "f", "g", "h"};
  for (const std::string& expected : results) {
    ASSERT_OK(iter->status());
    ASSERT_TRUE(iter->Valid());
    ASSERT_EQ(expected, iter->value().ToString());

    s = txn->GetForUpdate(read_options, iter->key(), nullptr);
    ASSERT_OK(s);

    iter->Next();
  }
  ASSERT_FALSE(iter->Valid());

  iter->Seek("G");
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("g", iter->value().ToString());

  iter->Prev();
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("f", iter->value().ToString());

  // "D" is deleted in the txn, so the seek lands on the next key, "E".
  iter->Seek("D");
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("e", iter->value().ToString());

  iter->Seek("C");
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("c", iter->value().ToString());

  iter->Next();
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("e", iter->value().ToString());

  iter->Seek("");
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("a", iter->value().ToString());

  iter->Seek("X");
  ASSERT_OK(iter->status());
  ASSERT_FALSE(iter->Valid());

  iter->SeekToLast();
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("h", iter->value().ToString());

  // key "C" was modified in the db after txn's snapshot.  txn will not commit.
  s = txn->Commit();
  ASSERT_TRUE(s.IsBusy());

  delete iter;
  delete txn;
}
// Exercises SetSavePoint()/RollbackToSavePoint(): rollbacks on an empty
// transaction, rollback to a recorded save point, rollback to the beginning
// of the transaction, and nested save points followed by a commit.
TEST_F(OptimisticTransactionTest, SavepointTest) {
  WriteOptions write_options;
  ReadOptions read_options;
  string value;
  Status s;

  Transaction* txn = txn_db->BeginTransaction(write_options);
  ASSERT_TRUE(txn);

  // Rolling back with nothing written and no save point is a no-op.
  txn->RollbackToSavePoint();

  txn->SetSavePoint();         // 1
  txn->RollbackToSavePoint();  // Rollback to beginning of txn
  txn->RollbackToSavePoint();

  s = txn->Put("B", "b");
  ASSERT_OK(s);

  s = txn->Commit();
  ASSERT_OK(s);

  s = db->Get(read_options, "B", &value);
  ASSERT_OK(s);
  ASSERT_EQ("b", value);

  delete txn;
  txn = txn_db->BeginTransaction(write_options);
  ASSERT_TRUE(txn);

  s = txn->Put("A", "a");
  ASSERT_OK(s);

  s = txn->Put("B", "bb");
  ASSERT_OK(s);

  s = txn->Put("C", "c");
  ASSERT_OK(s);

  txn->SetSavePoint();  // 2

  s = txn->Delete("B");
  ASSERT_OK(s);

  s = txn->Put("C", "cc");
  ASSERT_OK(s);

  s = txn->Put("D", "d");
  ASSERT_OK(s);

  txn->RollbackToSavePoint();  // Rollback to 2

  s = txn->Get(read_options, "A", &value);
  ASSERT_OK(s);
  ASSERT_EQ("a", value);

  s = txn->Get(read_options, "B", &value);
  ASSERT_OK(s);
  ASSERT_EQ("bb", value);

  s = txn->Get(read_options, "C", &value);
  ASSERT_OK(s);
  ASSERT_EQ("c", value);

  s = txn->Get(read_options, "D", &value);
  ASSERT_TRUE(s.IsNotFound());

  s = txn->Put("A", "a");
  ASSERT_OK(s);

  s = txn->Put("E", "e");
  ASSERT_OK(s);

  txn->RollbackToSavePoint();  // Rollback to beginning of txn

  // Only the committed "B" -> "b" from the first transaction remains visible.
  s = txn->Get(read_options, "A", &value);
  ASSERT_TRUE(s.IsNotFound());

  s = txn->Get(read_options, "B", &value);
  ASSERT_OK(s);
  ASSERT_EQ("b", value);

  // "C" was written before save point 2 and must be gone after the full
  // rollback (the original checked "D" twice and never verified "C").
  s = txn->Get(read_options, "C", &value);
  ASSERT_TRUE(s.IsNotFound());

  s = txn->Get(read_options, "D", &value);
  ASSERT_TRUE(s.IsNotFound());

  s = txn->Get(read_options, "E", &value);
  ASSERT_TRUE(s.IsNotFound());

  s = txn->Put("A", "aa");
  ASSERT_OK(s);

  s = txn->Put("F", "f");
  ASSERT_OK(s);

  txn->SetSavePoint();  // 3
  txn->SetSavePoint();  // 4

  s = txn->Put("G", "g");
  ASSERT_OK(s);

  s = txn->Delete("F");
  ASSERT_OK(s);

  s = txn->Delete("B");
  ASSERT_OK(s);

  s = txn->Get(read_options, "A", &value);
  ASSERT_OK(s);
  ASSERT_EQ("aa", value);

  s = txn->Get(read_options, "F", &value);
  ASSERT_TRUE(s.IsNotFound());

  s = txn->Get(read_options, "B", &value);
  ASSERT_TRUE(s.IsNotFound());

  txn->RollbackToSavePoint();  // Rollback to 3

  s = txn->Get(read_options, "F", &value);
  ASSERT_OK(s);
  ASSERT_EQ("f", value);

  s = txn->Get(read_options, "G", &value);
  ASSERT_TRUE(s.IsNotFound());

  s = txn->Commit();
  ASSERT_OK(s);

  s = db->Get(read_options, "F", &value);
  ASSERT_OK(s);
  ASSERT_EQ("f", value);

  s = db->Get(read_options, "G", &value);
  ASSERT_TRUE(s.IsNotFound());

  s = db->Get(read_options, "A", &value);
  ASSERT_OK(s);
  ASSERT_EQ("aa", value);

  s = db->Get(read_options, "B", &value);
  ASSERT_OK(s);
  ASSERT_EQ("b", value);

  s = db->Get(read_options, "C", &value);
  ASSERT_TRUE(s.IsNotFound());

  s = db->Get(read_options, "D", &value);
  ASSERT_TRUE(s.IsNotFound());

  s = db->Get(read_options, "E", &value);
  ASSERT_TRUE(s.IsNotFound());

  delete txn;
}
} // namespace rocksdb } // namespace rocksdb
int main(int argc, char** argv) { int main(int argc, char** argv) {

@ -0,0 +1,254 @@
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef ROCKSDB_LITE
#include <string>
#include <vector>
#include "utilities/transactions/transaction_db_impl.h"
#include "db/db_impl.h"
#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/utilities/transaction_db.h"
#include "utilities/transactions/transaction_impl.h"
namespace rocksdb {
// Wraps `db` in a TransactionDB and keeps a private copy of the options.
// The lock manager is sized from that stored copy (num_stripes and
// max_num_locks).
TransactionDBImpl::TransactionDBImpl(DB* db,
                                     const TransactionDBOptions& txn_db_options)
    : TransactionDB(db),
      txn_db_options_(txn_db_options),
      lock_mgr_(txn_db_options_.num_stripes, txn_db_options_.max_num_locks) {
}
// Allocates a new TransactionImpl bound to this database.  The returned
// pointer is a plain `new` allocation; callers in this file release it with
// `delete`.
Transaction* TransactionDBImpl::BeginTransaction(
    const WriteOptions& write_options, const TransactionOptions& txn_options) {
  return new TransactionImpl(this, write_options, txn_options);
}
// Returns a sanitized copy of `txn_db_options`: a stripe count of zero is
// replaced by one; every other field is copied through unchanged.
TransactionDBOptions TransactionDBImpl::ValidateTxnDBOptions(
    const TransactionDBOptions& txn_db_options) {
  TransactionDBOptions sanitized = txn_db_options;

  const bool no_stripes = (txn_db_options.num_stripes == 0);
  if (no_stripes) {
    sanitized.num_stripes = 1;
  }

  return sanitized;
}
// Convenience overload: opens a TransactionDB that uses only the default
// column family, by delegating to the column-family-aware Open().
Status TransactionDB::Open(const Options& options,
                           const TransactionDBOptions& txn_db_options,
                           const std::string& dbname, TransactionDB** dbptr) {
  // Split the combined Options into DB-wide and column-family parts.
  DBOptions db_opts(options);
  ColumnFamilyOptions default_cf_opts(options);

  std::vector<ColumnFamilyDescriptor> cf_descriptors;
  cf_descriptors.push_back(
      ColumnFamilyDescriptor(kDefaultColumnFamilyName, default_cf_opts));

  std::vector<ColumnFamilyHandle*> cf_handles;
  Status status = TransactionDB::Open(db_opts, txn_db_options, dbname,
                                      cf_descriptors, &cf_handles, dbptr);
  if (!status.ok()) {
    return status;
  }

  assert(cf_handles.size() == 1);
  // The handle can be deleted since DBImpl is always holding a reference to
  // the default column family.
  delete cf_handles[0];
  return status;
}
// Opens the underlying DB with the given column families and wraps it in a
// TransactionDBImpl.
//
// db_options / txn_db_options: options for the base DB and the transaction
//     layer (the latter is sanitized through ValidateTxnDBOptions()).
// column_families: descriptors to open; a private copy is adjusted so that
//     MemTable history is enabled for each of them.
// handles: receives one handle per opened column family.
// dbptr: on success, receives the newly allocated TransactionDB.
//
// Returns the status of DB::Open(); *dbptr is only written on success.
Status TransactionDB::Open(
    const DBOptions& db_options, const TransactionDBOptions& txn_db_options,
    const std::string& dbname,
    const std::vector<ColumnFamilyDescriptor>& column_families,
    std::vector<ColumnFamilyHandle*>* handles, TransactionDB** dbptr) {
  Status s;
  DB* db = nullptr;

  std::vector<ColumnFamilyDescriptor> column_families_copy = column_families;

  // Enable MemTable History if not already enabled
  for (auto& column_family : column_families_copy) {
    ColumnFamilyOptions* options = &column_family.options;

    if (options->max_write_buffer_number_to_maintain == 0) {
      // Setting to -1 will set the History size to max_write_buffer_number.
      options->max_write_buffer_number_to_maintain = -1;
    }
  }

  // Open with the adjusted copy; passing the caller's original descriptors
  // here would silently discard the history setting applied above.
  s = DB::Open(db_options, dbname, column_families_copy, handles, &db);

  if (s.ok()) {
    TransactionDBImpl* txn_db = new TransactionDBImpl(
        db, TransactionDBImpl::ValidateTxnDBOptions(txn_db_options));

    // Register every opened column family with the lock manager.
    for (auto cf_ptr : *handles) {
      txn_db->AddColumnFamily(cf_ptr);
    }

    *dbptr = txn_db;
  }

  return s;
}
// Registers the column family identified by `handle` with the
// TransactionLockMgr so that it can allocate a LockMap for it.
void TransactionDBImpl::AddColumnFamily(const ColumnFamilyHandle* handle) {
  const auto cf_id = handle->GetID();
  lock_mgr_.AddColumnFamily(cf_id);
}
// Creates a column family in the wrapped DB and, if that succeeds, registers
// it with the lock manager.  The whole operation runs under
// column_family_mutex_.
Status TransactionDBImpl::CreateColumnFamily(
    const ColumnFamilyOptions& options, const std::string& column_family_name,
    ColumnFamilyHandle** handle) {
  InstrumentedMutexLock guard(&column_family_mutex_);

  Status status = db_->CreateColumnFamily(options, column_family_name, handle);
  if (!status.ok()) {
    return status;
  }

  lock_mgr_.AddColumnFamily((*handle)->GetID());
  return status;
}
// Drops the column family in the wrapped DB and, on success, tells the
// TransactionLockMgr that it can deallocate the LockMap for it.  Runs under
// column_family_mutex_.
Status TransactionDBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
  InstrumentedMutexLock guard(&column_family_mutex_);

  Status status = db_->DropColumnFamily(column_family);
  if (!status.ok()) {
    return status;
  }

  lock_mgr_.RemoveColumnFamily(column_family->GetID());
  return status;
}
// Forwards a lock request for `key` in column family `cfh_id` on behalf of
// `txn` to the lock manager, supplying this DB's Env.
Status TransactionDBImpl::TryLock(TransactionImpl* txn, uint32_t cfh_id,
                                  const std::string& key) {
  Env* const env = GetEnv();
  return lock_mgr_.TryLock(txn, cfh_id, key, env);
}
// Forwards an unlock request for every key in `keys` held by `txn` to the
// lock manager, supplying this DB's Env.
void TransactionDBImpl::UnLock(TransactionImpl* txn, TransactionKeyMap* keys) {
  Env* const env = GetEnv();
  lock_mgr_.UnLock(txn, keys, env);
}
// Forwards an unlock request for the single `key` in column family `cfh_id`
// held by `txn` to the lock manager, supplying this DB's Env.
void TransactionDBImpl::UnLock(TransactionImpl* txn, uint32_t cfh_id,
                               const std::string& key) {
  Env* const env = GetEnv();
  lock_mgr_.UnLock(txn, cfh_id, key, env);
}
// Used when wrapping DB write operations in a transaction.
//
// Starts a transaction with default TransactionOptions and overrides its lock
// timeout with txn_db_options_.default_lock_timeout.  The caller owns the
// returned transaction and must delete it.
Transaction* TransactionDBImpl::BeginInternalTransaction(
    const WriteOptions& options) {
  TransactionOptions txn_options;
  Transaction* txn = BeginTransaction(options, txn_options);

  // BeginTransaction() always creates a TransactionImpl; the assert verifies
  // that in debug builds and static_cast performs the hierarchy downcast
  // (reinterpret_cast is not the right tool for base-to-derived conversion).
  assert(dynamic_cast<TransactionImpl*>(txn) != nullptr);
  auto txn_impl = static_cast<TransactionImpl*>(txn);

  // Use default timeout for non-transactional writes
  txn_impl->SetLockTimeout(txn_db_options_.default_lock_timeout);

  return txn;
}
// All user Put, Merge, Delete, and Write requests must be intercepted to make
// sure that they lock all keys that they are writing to avoid causing
// conflicts with any concurrent transactions.  The easiest way to do this is
// to wrap all write operations in a transaction.
//
// Put(), Merge(), and Delete() only lock a single key per call.  Write() will
// sort its keys before locking them.  This guarantees that TransactionDB write
// methods cannot deadlock with each other (but still could deadlock with a
// Transaction).
Status TransactionDBImpl::Put(const WriteOptions& options,
                              ColumnFamilyHandle* column_family,
                              const Slice& key, const Slice& val) {
  Transaction* internal_txn = BeginInternalTransaction(options);

  // Since the client didn't create a transaction, they don't care about
  // conflict checking for this write.  So we just need to do PutUntracked().
  Status status = internal_txn->PutUntracked(column_family, key, val);
  if (status.ok()) {
    status = internal_txn->Commit();
  }

  delete internal_txn;
  return status;
}
// Deletes `key` through a short-lived internal transaction so that the key is
// locked for the duration of the write.
Status TransactionDBImpl::Delete(const WriteOptions& wopts,
                                 ColumnFamilyHandle* column_family,
                                 const Slice& key) {
  Transaction* internal_txn = BeginInternalTransaction(wopts);

  // Since the client didn't create a transaction, they don't care about
  // conflict checking for this write.  So we just need to do
  // DeleteUntracked().
  Status status = internal_txn->DeleteUntracked(column_family, key);
  if (status.ok()) {
    status = internal_txn->Commit();
  }

  delete internal_txn;
  return status;
}
// Merges `value` into `key` through a short-lived internal transaction so
// that the key is locked for the duration of the write.
Status TransactionDBImpl::Merge(const WriteOptions& options,
                                ColumnFamilyHandle* column_family,
                                const Slice& key, const Slice& value) {
  Transaction* internal_txn = BeginInternalTransaction(options);

  // Since the client didn't create a transaction, they don't care about
  // conflict checking for this write.  So we just need to do
  // MergeUntracked().
  Status status = internal_txn->MergeUntracked(column_family, key, value);
  if (status.ok()) {
    status = internal_txn->Commit();
  }

  delete internal_txn;
  return status;
}
// Applies `updates` through an internal transaction so that every key in the
// batch is locked while the batch is written.  Returns the status reported by
// TransactionImpl::CommitBatch().
Status TransactionDBImpl::Write(const WriteOptions& opts, WriteBatch* updates) {
  // Need to lock all keys in this batch to prevent write conflicts with
  // concurrent transactions.
  Transaction* txn = BeginInternalTransaction(opts);

  // The internal transaction is always a TransactionImpl; verify that in debug
  // builds and downcast with static_cast (the correct cast within a class
  // hierarchy, unlike reinterpret_cast).
  assert(dynamic_cast<TransactionImpl*>(txn) != nullptr);
  auto txn_impl = static_cast<TransactionImpl*>(txn);

  // Since CommitBatch sorts the keys before locking, concurrent Write()
  // operations will not cause a deadlock.
  // In order to avoid a deadlock with a concurrent Transaction, Transactions
  // should use a lock timeout.
  Status s = txn_impl->CommitBatch(updates);

  delete txn;

  return s;
}
} // namespace rocksdb
#endif // ROCKSDB_LITE

@ -0,0 +1,80 @@
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#ifndef ROCKSDB_LITE
#include <string>
#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/utilities/transaction_db.h"
#include "utilities/transactions/transaction_impl.h"
#include "utilities/transactions/transaction_lock_mgr.h"
namespace rocksdb {
// TransactionDB implementation.  It owns the TransactionLockMgr used for key
// locking and overrides the DB write entry points (Put/Delete/Merge/Write) so
// that each of them runs inside an internal transaction.
class TransactionDBImpl : public TransactionDB {
public:
// Wraps `db`; `txn_db_options` is copied and also used to size the lock
// manager.
explicit TransactionDBImpl(DB* db,
const TransactionDBOptions& txn_db_options);
~TransactionDBImpl() {}
// Returns a newly allocated transaction bound to this DB.
Transaction* BeginTransaction(const WriteOptions& write_options,
const TransactionOptions& txn_options) override;
// The four write overrides below each start an internal transaction, perform
// the write through it and delete the transaction again.
using StackableDB::Put;
virtual Status Put(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& val) override;
using StackableDB::Delete;
virtual Status Delete(const WriteOptions& wopts,
ColumnFamilyHandle* column_family,
const Slice& key) override;
using StackableDB::Merge;
virtual Status Merge(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) override;
using StackableDB::Write;
virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
// Creates the column family in the wrapped DB and, on success, registers it
// with the lock manager.
using StackableDB::CreateColumnFamily;
virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
const std::string& column_family_name,
ColumnFamilyHandle** handle) override;
// Drops the column family in the wrapped DB and, on success, removes it from
// the lock manager.
using StackableDB::DropColumnFamily;
virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override;
// Lock and unlock requests are forwarded to the lock manager together with
// this DB's Env.
Status TryLock(TransactionImpl* txn, uint32_t cfh_id, const std::string& key);
void UnLock(TransactionImpl* txn, TransactionKeyMap* keys);
void UnLock(TransactionImpl* txn, uint32_t cfh_id, const std::string& key);
// Registers an already existing column family with the lock manager.
void AddColumnFamily(const ColumnFamilyHandle* handle);
// Returns a copy of `txn_db_options` in which a num_stripes of 0 has been
// replaced by 1.
static TransactionDBOptions ValidateTxnDBOptions(
const TransactionDBOptions& txn_db_options);
const TransactionDBOptions& GetTxnDBOptions() const {
return txn_db_options_;
}
private:
// NOTE: txn_db_options_ is declared before lock_mgr_ because the
// constructor's initializer for lock_mgr_ reads txn_db_options_.
const TransactionDBOptions txn_db_options_;
TransactionLockMgr lock_mgr_;
// Must be held when adding/dropping column families.
InstrumentedMutex column_family_mutex_;
// Starts a transaction with default TransactionOptions whose lock timeout is
// txn_db_options_.default_lock_timeout; used by the write overrides above.
Transaction* BeginInternalTransaction(const WriteOptions& options);
// NOTE(review): no definition of WriteHelper is visible in the accompanying
// .cc file -- confirm whether this declaration is still needed.
Status WriteHelper(WriteBatch* updates, TransactionImpl* txn_impl);
};
} // namespace rocksdb
#endif // ROCKSDB_LITE

@ -0,0 +1,598 @@
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef ROCKSDB_LITE
#include "utilities/transactions/transaction_impl.h"
#include <map>
#include <set>
#include <string>
#include <vector>
#include "db/column_family.h"
#include "db/db_impl.h"
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/status.h"
#include "rocksdb/utilities/transaction_db.h"
#include "util/string_util.h"
#include "utilities/transactions/transaction_db_impl.h"
#include "utilities/transactions/transaction_util.h"
namespace rocksdb {
struct WriteOptions;
// Process-wide counter backing GenTxnID(); the first id handed out is 1.
std::atomic<TransactionID> TransactionImpl::txn_id_counter_(1);

// Returns the current counter value and atomically advances the counter, so
// every call yields a distinct id.
TransactionID TransactionImpl::GenTxnID() {
  const TransactionID next_id = txn_id_counter_++;
  return next_id;
}
// Constructs a transaction on `txn_db`.
//
// write_options: copied and later used when the batch is written at commit.
// txn_options: supplies the expiration, the lock timeout and whether a
//     snapshot should be taken immediately.
//
// The initializers below depend on member declaration order: cmp_ must be
// initialized before write_batch_, and start_time_ before expiration_time_.
TransactionImpl::TransactionImpl(TransactionDB* txn_db,
const WriteOptions& write_options,
const TransactionOptions& txn_options)
: db_(txn_db),
txn_db_impl_(nullptr),
txn_id_(GenTxnID()),
write_options_(write_options),
snapshot_(nullptr),
cmp_(GetColumnFamilyUserComparator(txn_db->DefaultColumnFamily())),
write_batch_(new WriteBatchWithIndex(cmp_, 0, true)),
// Construction time in milliseconds (NowMicros() / 1000); only recorded when
// an expiration was requested (expiration >= 0), otherwise 0.
start_time_(
txn_options.expiration >= 0 ? db_->GetEnv()->NowMicros() / 1000 : 0),
// Absolute deadline on the same millisecond clock; 0 means "never expires"
// (see IsExpired()).
expiration_time_(txn_options.expiration >= 0
? start_time_ + txn_options.expiration
: 0),
lock_timeout_(txn_options.lock_timeout) {
// The lock/unlock helpers live on TransactionDBImpl, so the DB handed in
// must be one; this is only verified by the assert.
txn_db_impl_ = dynamic_cast<TransactionDBImpl*>(txn_db);
assert(txn_db_impl_);
if (lock_timeout_ < 0) {
// Lock timeout not set, use default
lock_timeout_ = txn_db_impl_->GetTxnDBOptions().transaction_lock_timeout;
}
if (txn_options.set_snapshot) {
SetSnapshot();
}
}
// Releases everything the transaction still holds: Cleanup() drops the
// buffered writes and key locks, then the snapshot (if any) is released.
TransactionImpl::~TransactionImpl() {
  Cleanup();

  if (snapshot_ == nullptr) {
    return;
  }
  db_->ReleaseSnapshot(snapshot_);
}
void TransactionImpl::SetSnapshot() {
if (snapshot_ != nullptr) {
db_->ReleaseSnapshot(snapshot_);
}
snapshot_ = db_->GetSnapshot();
}
// Returns the transaction to its empty state: buffered writes are discarded,
// all tracked key locks are released and the save point stack is dropped.
void TransactionImpl::Cleanup() {
  // Discard buffered writes.
  write_batch_->Clear();
  num_entries_ = 0;

  // Release the locks before forgetting which keys were locked.
  txn_db_impl_->UnLock(this, &tracked_keys_);
  tracked_keys_.clear();

  save_points_.reset();
}
// Returns true if this transaction has an expiration time and the Env clock
// has reached it.  expiration_time_ is in milliseconds, so it is scaled to
// microseconds for the comparison with NowMicros().
bool TransactionImpl::IsExpired() const {
  // An expiration_time_ of zero means the transaction never expires; the
  // clock is only consulted when a deadline exists.
  const bool has_deadline = expiration_time_ > 0;
  return has_deadline &&
         db_->GetEnv()->NowMicros() >= expiration_time_ * 1000;
}
// Locks every key in `batch`, writes the batch, and releases those locks
// again.  If locking fails nothing is written and the lock error is returned.
Status TransactionImpl::CommitBatch(WriteBatch* batch) {
  TransactionKeyMap locked_keys;

  Status status = LockBatch(batch, &locked_keys);
  if (!status.ok()) {
    return status;
  }

  status = DoCommit(batch);
  txn_db_impl_->UnLock(this, &locked_keys);
  return status;
}
// Writes the transaction's buffered batch and then resets the transaction
// (Cleanup() runs whether or not the write succeeded).  Returns the write
// status.
Status TransactionImpl::Commit() {
  WriteBatch* const pending = write_batch_->GetWriteBatch();
  Status commit_status = DoCommit(pending);

  Cleanup();

  return commit_status;
}
// Writes `batch` to the base DB using this transaction's write options.
//
// If the transaction has an expiration time, the write goes through
// DBImpl::WriteWithCallback() with a TransactionCallback that checks the
// expiration; otherwise it is a plain DB::Write().  Returns the status of
// whichever write call was made.
Status TransactionImpl::DoCommit(WriteBatch* batch) {
  Status s;

  // Do write directly on base db as TransactionDB::Write() would attempt to
  // do conflict checking that we've already done.
  DB* db = db_->GetBaseDB();

  if (expiration_time_ > 0) {
    // We cannot commit a transaction that is expired as its locks might have
    // been released.
    // To avoid race conditions, we need to use a WriteCallback to check the
    // expiration time once we're on the writer thread.
    TransactionCallback callback(this);

    // The assert verifies the dynamic type in debug builds; static_cast is
    // the appropriate cast for the DB -> DBImpl hierarchy downcast.
    assert(dynamic_cast<DBImpl*>(db) != nullptr);
    auto db_impl = static_cast<DBImpl*>(db);

    s = db_impl->WriteWithCallback(write_options_, batch, &callback);
  } else {
    s = db->Write(write_options_, batch);
  }

  return s;
}
// Abandons the transaction: all buffered writes, key locks and save points
// are dropped via Cleanup().
void TransactionImpl::Rollback() {
  Cleanup();
}
// Records the current number of batch entries as a save point.
void TransactionImpl::SetSavePoint() {
  // If transaction is empty, no need to record anything.
  if (num_entries_ == 0) {
    return;
  }

  // The stack is created lazily on the first recorded save point.
  if (!save_points_) {
    save_points_.reset(new std::stack<size_t>());
  }
  save_points_->push(num_entries_);
}
void TransactionImpl::RollbackToSavePoint() {
size_t savepoint_entries = 0;
if (save_points_ != nullptr && save_points_->size() > 0) {
savepoint_entries = save_points_->top();
save_points_->pop();
}
assert(savepoint_entries <= num_entries_);
if (savepoint_entries == num_entries_) {
// No changes to rollback
} else if (savepoint_entries == 0) {
// Rollback everything
Rollback();
} else {
assert(dynamic_cast<DBImpl*>(db_->GetBaseDB()) != nullptr);
auto db_impl = reinterpret_cast<DBImpl*>(db_->GetBaseDB());
WriteBatchWithIndex* new_batch = new WriteBatchWithIndex(cmp_, 0, true);
Status s = TransactionUtil::CopyFirstN(
savepoint_entries, write_batch_.get(), new_batch, db_impl);
if (!s.ok()) {
// TODO: Should we change this function to return a Status or should we
// somehow make it so RollbackToSavePoint() can never fail?? Not easy to
// handle the case where a client accesses a column family that's been
// dropped.
// After chatting with Siying, I'm going to send a diff that adds
// savepoint support in WriteBatchWithIndex and let reviewers decide which
// approach is cleaner.
fprintf(stderr, "STATUS: %s \n", s.ToString().c_str());
delete new_batch;
} else {
write_batch_.reset(new_batch);
}
num_entries_ = savepoint_entries;
}
}
// Lock all keys in this batch.
//
// batch: the write batch whose Put/Merge/Delete keys should be locked.
// keys_to_unlock: receives every key that was locked, grouped by column
//     family id.
//
// On success, caller should unlock keys_to_unlock.  On failure, any keys that
// were locked by this call have already been unlocked again and the failing
// status is returned.  A failure while iterating the batch is returned before
// any lock is taken.
Status TransactionImpl::LockBatch(WriteBatch* batch,
                                  TransactionKeyMap* keys_to_unlock) {
  class Handler : public WriteBatch::Handler {
   public:
    // Sorted map of column_family_id to sorted set of keys.
    // Since LockBatch() always locks keys in sorted order, it cannot deadlock
    // with itself.  We're not using a comparator here since it doesn't matter
    // what the sorting is as long as it's consistent.
    std::map<uint32_t, std::set<std::string>> keys_;

    Handler() {}

    void RecordKey(uint32_t column_family_id, const Slice& key) {
      // std::set::insert ignores keys that are already present, so no prior
      // lookup is needed.
      keys_[column_family_id].insert(key.ToString());
    }

    virtual Status PutCF(uint32_t column_family_id, const Slice& key,
                         const Slice& value) override {
      RecordKey(column_family_id, key);
      return Status::OK();
    }
    virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
                           const Slice& value) override {
      RecordKey(column_family_id, key);
      return Status::OK();
    }
    virtual Status DeleteCF(uint32_t column_family_id,
                            const Slice& key) override {
      RecordKey(column_family_id, key);
      return Status::OK();
    }
  };

  // Iterating on this handler will add all keys in this batch into keys_
  Handler handler;
  Status s = batch->Iterate(&handler);
  if (!s.ok()) {
    // Do not lock a partially collected key set.
    return s;
  }

  // Attempt to lock all keys
  for (const auto& cf_iter : handler.keys_) {
    const uint32_t cfh_id = cf_iter.first;
    const auto& cfh_keys = cf_iter.second;

    for (const std::string& key : cfh_keys) {
      s = txn_db_impl_->TryLock(this, cfh_id, key);
      if (!s.ok()) {
        break;
      }
      // The key is owned by the (const) handler set, so it is copied here.
      (*keys_to_unlock)[cfh_id].insert({key, kMaxSequenceNumber});
    }

    if (!s.ok()) {
      break;
    }
  }

  if (!s.ok()) {
    txn_db_impl_->UnLock(this, keys_to_unlock);
  }

  return s;
}
// SliceParts overload: concatenates the key parts into one contiguous string
// and forwards to the Slice-based TryLock().
Status TransactionImpl::TryLock(ColumnFamilyHandle* column_family,
                                const SliceParts& key, bool check_snapshot) {
  // First pass: total length, so the string is allocated only once.
  size_t total_size = 0;
  for (int part = 0; part < key.num_parts; ++part) {
    total_size += key.parts[part].size();
  }

  // Second pass: append each part in order.
  std::string joined_key;
  joined_key.reserve(total_size);
  for (int part = 0; part < key.num_parts; ++part) {
    const Slice& piece = key.parts[part];
    joined_key.append(piece.data(), piece.size());
  }

  return TryLock(column_family, joined_key, check_snapshot);
}
// Attempt to lock this key.
// Returns OK if the key has been successfully locked.  Non-ok, otherwise.
// If check_snapshot is true and this transaction has a snapshot set,
// this key will only be locked if there have been no writes to this key since
// the snapshot time.
//
// tracked_keys_ maps each locked key to a sequence number after which the key
// is known not to have been modified; kMaxSequenceNumber marks a key that has
// been locked but not yet validated.
Status TransactionImpl::TryLock(ColumnFamilyHandle* column_family,
const Slice& key, bool check_snapshot) {
uint32_t cfh_id = GetColumnFamilyID(column_family);
std::string key_str = key.ToString();
bool previously_locked;
Status s;
// Lock this key if this transaction hasn't already locked it.
auto iter = tracked_keys_[cfh_id].find(key_str);
if (iter == tracked_keys_[cfh_id].end()) {
previously_locked = false;
s = txn_db_impl_->TryLock(this, cfh_id, key_str);
if (s.ok()) {
// Record that we've locked this key
auto result = tracked_keys_[cfh_id].insert({key_str, kMaxSequenceNumber});
iter = result.first;
}
// If the lock attempt failed, iter is still end(); it is not dereferenced
// below because every later use is guarded by s.ok().
} else {
previously_locked = true;
}
if (s.ok()) {
// If a snapshot is set, we need to make sure the key hasn't been modified
// since the snapshot.  This must be done after we locked the key.
if (!check_snapshot || snapshot_ == nullptr) {
// Need to remember the earliest sequence number that we know that this
// key has not been modified after.  This is useful if this same
// transaction later tries to lock this key again.
if (iter->second == kMaxSequenceNumber) {
// Since we haven't checked a snapshot, we only know this key has not
// been modified since after we locked it.
iter->second = db_->GetLatestSequenceNumber();
}
} else {
// If the key has been previously validated at a sequence number earlier
// than the current snapshot's sequence number, we already know it has not
// been modified.
bool already_validated = iter->second <= snapshot_->GetSequenceNumber();
if (!already_validated) {
s = CheckKeySequence(column_family, key);
if (s.ok()) {
// Record that there have been no writes to this key after this
// sequence.
iter->second = snapshot_->GetSequenceNumber();
} else {
// Failed to validate key
if (!previously_locked) {
// Unlock key we just locked, and stop tracking it.  A key that was
// already locked before this call stays locked.
txn_db_impl_->UnLock(this, cfh_id, key.ToString());
tracked_keys_[cfh_id].erase(iter);
}
}
}
}
}
return s;
}
// Return OK() if this key has not been modified more recently than the
// transaction snapshot_.
//
// column_family: column family of the key; nullptr selects the base DB's
//     default column family.
// key: the key to validate.
//
// When no snapshot is set there is nothing to compare against and OK is
// returned.  Otherwise the result of TransactionUtil::CheckKeyForConflicts()
// at the snapshot's sequence number is returned.
Status TransactionImpl::CheckKeySequence(ColumnFamilyHandle* column_family,
                                         const Slice& key) {
  Status result;
  if (snapshot_ != nullptr) {
    // The assert verifies the dynamic type in debug builds; static_cast is
    // the appropriate cast for the DB -> DBImpl hierarchy downcast.
    assert(dynamic_cast<DBImpl*>(db_->GetBaseDB()) != nullptr);
    auto db_impl = static_cast<DBImpl*>(db_->GetBaseDB());

    ColumnFamilyHandle* cfh =
        column_family ? column_family : db_impl->DefaultColumnFamily();

    result = TransactionUtil::CheckKeyForConflicts(
        db_impl, cfh, key.ToString(), snapshot_->GetSequenceNumber());
  }

  return result;
}
// Reads `key` through the transaction's write batch layered over the DB
// (WriteBatchWithIndex::GetFromBatchAndDB).  No lock is taken.
Status TransactionImpl::Get(const ReadOptions& read_options,
                            ColumnFamilyHandle* column_family, const Slice& key,
                            std::string* value) {
  Status read_status = write_batch_->GetFromBatchAndDB(
      db_, read_options, column_family, key, value);
  return read_status;
}
// Locks `key` and then, if `value` is non-null, reads it.  When the lock
// cannot be taken its status is returned and no read is attempted; when
// `value` is null only the lock is taken.
Status TransactionImpl::GetForUpdate(const ReadOptions& read_options,
                                     ColumnFamilyHandle* column_family,
                                     const Slice& key, std::string* value) {
  Status lock_status = TryLock(column_family, key);

  const bool skip_read = !lock_status.ok() || value == nullptr;
  if (skip_read) {
    return lock_status;
  }

  return Get(read_options, column_family, key, value);
}
// Reads every key in `keys` (keys[i] from column_family[i]) with Get() and
// returns one Status per key, in order.
//
// values: if non-null, resized to keys.size() and filled with the values
//     read.  If null, only the statuses are produced; each read then goes
//     into a throwaway string so that Get() is never handed a null output.
//
// `column_family` is indexed in lockstep with `keys` and is expected to have
// at least as many elements.
std::vector<Status> TransactionImpl::MultiGet(
    const ReadOptions& read_options,
    const std::vector<ColumnFamilyHandle*>& column_family,
    const std::vector<Slice>& keys, std::vector<std::string>* values) {
  const size_t num_keys = keys.size();

  // Only touch `values` after checking it; the per-key code below already
  // treats it as optional.
  if (values != nullptr) {
    values->resize(num_keys);
  }

  std::vector<Status> stat_list(num_keys);
  for (size_t i = 0; i < num_keys; ++i) {
    std::string discarded;
    std::string* value = (values != nullptr) ? &(*values)[i] : &discarded;
    stat_list[i] = Get(read_options, column_family[i], keys[i], value);
  }

  return stat_list;
}
// Locks every key in `keys` (keys[i] in column_family[i]) and then reads
// them, returning one Status per key, in order.
//
// If any key cannot be locked, the whole call fails: every returned Status is
// that lock failure and nothing is read.  Locks taken for earlier keys in the
// same call are not released here.
//
// values: if non-null, resized to keys.size() and filled with the values
//     read.  If null, the keys are only locked and OK is returned for each,
//     mirroring GetForUpdate() with a null value.
std::vector<Status> TransactionImpl::MultiGetForUpdate(
    const ReadOptions& read_options,
    const std::vector<ColumnFamilyHandle*>& column_family,
    const std::vector<Slice>& keys, std::vector<std::string>* values) {
  const size_t num_keys = keys.size();

  // Only touch `values` after checking it.
  if (values != nullptr) {
    values->resize(num_keys);
  }

  // Lock all keys
  for (size_t i = 0; i < num_keys; ++i) {
    Status s = TryLock(column_family[i], keys[i]);
    if (!s.ok()) {
      // Fail entire multiget if we cannot lock all keys
      return std::vector<Status>(num_keys, s);
    }
  }

  // TODO(agiardullo): optimize multiget?
  std::vector<Status> stat_list(num_keys);
  if (values == nullptr) {
    // Lock-only request: nothing to read, all statuses stay OK.
    return stat_list;
  }

  for (size_t i = 0; i < num_keys; ++i) {
    stat_list[i] =
        Get(read_options, column_family[i], keys[i], &(*values)[i]);
  }

  return stat_list;
}
// Returns an iterator that layers the transaction's write batch over a new
// DB iterator (WriteBatchWithIndex::NewIteratorWithBase).
Iterator* TransactionImpl::GetIterator(const ReadOptions& read_options) {
  Iterator* base_iter = db_->NewIterator(read_options);
  assert(base_iter);

  Iterator* merged_iter = write_batch_->NewIteratorWithBase(base_iter);
  return merged_iter;
}
// Column-family variant: returns an iterator that layers the transaction's
// write batch for `column_family` over a new DB iterator on that column
// family.
Iterator* TransactionImpl::GetIterator(const ReadOptions& read_options,
                                       ColumnFamilyHandle* column_family) {
  Iterator* base_iter = db_->NewIterator(read_options, column_family);
  assert(base_iter);

  Iterator* merged_iter =
      write_batch_->NewIteratorWithBase(column_family, base_iter);
  return merged_iter;
}
// Locks `key` (with snapshot validation) and, if that succeeds, buffers the
// Put in the write batch.  Returns the lock status.
Status TransactionImpl::Put(ColumnFamilyHandle* column_family, const Slice& key,
                            const Slice& value) {
  Status lock_status = TryLock(column_family, key);
  if (!lock_status.ok()) {
    return lock_status;
  }

  write_batch_->Put(column_family, key, value);
  ++num_entries_;
  return lock_status;
}
// SliceParts variant: locks the key (with snapshot validation) and, if that
// succeeds, buffers the Put in the write batch.  Returns the lock status.
Status TransactionImpl::Put(ColumnFamilyHandle* column_family,
                            const SliceParts& key, const SliceParts& value) {
  Status lock_status = TryLock(column_family, key);
  if (!lock_status.ok()) {
    return lock_status;
  }

  write_batch_->Put(column_family, key, value);
  ++num_entries_;
  return lock_status;
}
// Locks `key` (with snapshot validation) and, if that succeeds, buffers the
// Merge in the write batch.  Returns the lock status.
Status TransactionImpl::Merge(ColumnFamilyHandle* column_family,
                              const Slice& key, const Slice& value) {
  Status lock_status = TryLock(column_family, key);
  if (!lock_status.ok()) {
    return lock_status;
  }

  write_batch_->Merge(column_family, key, value);
  ++num_entries_;
  return lock_status;
}
// Locks `key` (with snapshot validation) and, if that succeeds, buffers the
// Delete in the write batch.  Returns the lock status.
Status TransactionImpl::Delete(ColumnFamilyHandle* column_family,
                               const Slice& key) {
  Status lock_status = TryLock(column_family, key);
  if (!lock_status.ok()) {
    return lock_status;
  }

  write_batch_->Delete(column_family, key);
  ++num_entries_;
  return lock_status;
}
// SliceParts variant: locks the key (with snapshot validation) and, if that
// succeeds, buffers the Delete in the write batch.  Returns the lock status.
Status TransactionImpl::Delete(ColumnFamilyHandle* column_family,
                               const SliceParts& key) {
  Status lock_status = TryLock(column_family, key);
  if (!lock_status.ok()) {
    return lock_status;
  }

  write_batch_->Delete(column_family, key);
  ++num_entries_;
  return lock_status;
}
// Buffers a Put without validating the key against the transaction snapshot.
//
// Even though we do not care about doing conflict checking for this write,
// we still need to take a lock to make sure we do not cause a conflict with
// some other write.  However, we do not need to check if there have been
// any writes since this transaction's snapshot.
Status TransactionImpl::PutUntracked(ColumnFamilyHandle* column_family,
                                     const Slice& key, const Slice& value) {
  // TODO(agiardullo): could optimize by supporting shared txn locks in the
  // future
  Status lock_status = TryLock(column_family, key, false /* check_snapshot */);
  if (!lock_status.ok()) {
    return lock_status;
  }

  write_batch_->Put(column_family, key, value);
  ++num_entries_;
  return lock_status;
}
// SliceParts variant of PutUntracked(): the key is locked without snapshot
// validation and, on success, the Put is buffered in the write batch.
Status TransactionImpl::PutUntracked(ColumnFamilyHandle* column_family,
                                     const SliceParts& key,
                                     const SliceParts& value) {
  Status lock_status = TryLock(column_family, key, false /* check_snapshot */);
  if (!lock_status.ok()) {
    return lock_status;
  }

  write_batch_->Put(column_family, key, value);
  ++num_entries_;
  return lock_status;
}
// Locks `key` without snapshot validation and, on success, buffers the Merge
// in the write batch.  Returns the lock status.
Status TransactionImpl::MergeUntracked(ColumnFamilyHandle* column_family,
                                       const Slice& key, const Slice& value) {
  Status lock_status = TryLock(column_family, key, false /* check_snapshot */);
  if (!lock_status.ok()) {
    return lock_status;
  }

  write_batch_->Merge(column_family, key, value);
  ++num_entries_;
  return lock_status;
}
// Locks `key` without snapshot validation and, on success, buffers the Delete
// in the write batch.  Returns the lock status.
Status TransactionImpl::DeleteUntracked(ColumnFamilyHandle* column_family,
                                        const Slice& key) {
  Status lock_status = TryLock(column_family, key, false /* check_snapshot */);
  if (!lock_status.ok()) {
    return lock_status;
  }

  write_batch_->Delete(column_family, key);
  ++num_entries_;
  return lock_status;
}
// SliceParts variant of DeleteUntracked(): the key is locked without snapshot
// validation and, on success, the Delete is buffered in the write batch.
Status TransactionImpl::DeleteUntracked(ColumnFamilyHandle* column_family,
                                        const SliceParts& key) {
  Status lock_status = TryLock(column_family, key, false /* check_snapshot */);
  if (!lock_status.ok()) {
    return lock_status;
  }

  write_batch_->Delete(column_family, key);
  ++num_entries_;
  return lock_status;
}
// Appends `blob` to the write batch as log data.  No key is locked, but the
// entry is counted in num_entries_ like any other batch entry.
void TransactionImpl::PutLogData(const Slice& blob) {
  write_batch_->PutLogData(blob);
  ++num_entries_;
}
// Returns a non-owning pointer to the transaction's indexed write batch; the
// transaction keeps ownership.
WriteBatchWithIndex* TransactionImpl::GetWriteBatch() {
  WriteBatchWithIndex* const batch = write_batch_.get();
  return batch;
}
} // namespace rocksdb
#endif // ROCKSDB_LITE

@ -0,0 +1,263 @@
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#ifndef ROCKSDB_LITE
#include <atomic>
#include <stack>
#include <string>
#include <unordered_map>
#include <vector>
#include "db/write_callback.h"
#include "rocksdb/db.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "rocksdb/types.h"
#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"
#include "rocksdb/utilities/write_batch_with_index.h"
#include "utilities/transactions/transaction_util.h"
namespace rocksdb {
using TransactionID = uint64_t;
class TransactionDBImpl;
class TransactionImpl : public Transaction {
public:
TransactionImpl(TransactionDB* db, const WriteOptions& write_options,
const TransactionOptions& txn_options);
virtual ~TransactionImpl();
Status Commit() override;
Status CommitBatch(WriteBatch* batch);
void Rollback() override;
void SetSavePoint() override;
void RollbackToSavePoint() override;
Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, std::string* value) override;
Status Get(const ReadOptions& options, const Slice& key,
std::string* value) override {
return Get(options, db_->DefaultColumnFamily(), key, value);
}
Status GetForUpdate(const ReadOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
std::string* value) override;
Status GetForUpdate(const ReadOptions& options, const Slice& key,
std::string* value) override {
return GetForUpdate(options, db_->DefaultColumnFamily(), key, value);
}
std::vector<Status> MultiGet(
const ReadOptions& options,
const std::vector<ColumnFamilyHandle*>& column_family,
const std::vector<Slice>& keys,
std::vector<std::string>* values) override;
std::vector<Status> MultiGet(const ReadOptions& options,
const std::vector<Slice>& keys,
std::vector<std::string>* values) override {
return MultiGet(options, std::vector<ColumnFamilyHandle*>(
keys.size(), db_->DefaultColumnFamily()),
keys, values);
}
std::vector<Status> MultiGetForUpdate(
const ReadOptions& options,
const std::vector<ColumnFamilyHandle*>& column_family,
const std::vector<Slice>& keys,
std::vector<std::string>* values) override;
std::vector<Status> MultiGetForUpdate(
const ReadOptions& options, const std::vector<Slice>& keys,
std::vector<std::string>* values) override {
return MultiGetForUpdate(options,
std::vector<ColumnFamilyHandle*>(
keys.size(), db_->DefaultColumnFamily()),
keys, values);
}
Iterator* GetIterator(const ReadOptions& read_options) override;
Iterator* GetIterator(const ReadOptions& read_options,
ColumnFamilyHandle* column_family) override;
Status Put(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) override;
Status Put(const Slice& key, const Slice& value) override {
return Put(nullptr, key, value);
}
Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
const SliceParts& value) override;
Status Put(const SliceParts& key, const SliceParts& value) override {
return Put(nullptr, key, value);
}
Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) override;
Status Merge(const Slice& key, const Slice& value) override {
return Merge(nullptr, key, value);
}
Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override;
Status Delete(const Slice& key) override { return Delete(nullptr, key); }
Status Delete(ColumnFamilyHandle* column_family,
const SliceParts& key) override;
Status Delete(const SliceParts& key) override { return Delete(nullptr, key); }
Status PutUntracked(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) override;
Status PutUntracked(const Slice& key, const Slice& value) override {
return PutUntracked(nullptr, key, value);
}
Status PutUntracked(ColumnFamilyHandle* column_family, const SliceParts& key,
const SliceParts& value) override;
Status PutUntracked(const SliceParts& key, const SliceParts& value) override {
return PutUntracked(nullptr, key, value);
}
Status MergeUntracked(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) override;
Status MergeUntracked(const Slice& key, const Slice& value) override {
return MergeUntracked(nullptr, key, value);
}
Status DeleteUntracked(ColumnFamilyHandle* column_family,
const Slice& key) override;
Status DeleteUntracked(const Slice& key) override {
return DeleteUntracked(nullptr, key);
}
Status DeleteUntracked(ColumnFamilyHandle* column_family,
const SliceParts& key) override;
Status DeleteUntracked(const SliceParts& key) override {
return DeleteUntracked(nullptr, key);
}
void PutLogData(const Slice& blob) override;
const Snapshot* GetSnapshot() const override { return snapshot_; }
void SetSnapshot() override;
WriteBatchWithIndex* GetWriteBatch() override;
// Generate a new unique transaction identifier
static TransactionID GenTxnID();
TransactionID GetTxnID() const { return txn_id_; }
// Returns the time (in milliseconds according to Env->GetMicros()*1000)
// that this transaction will be expired. Returns 0 if this transaction does
// not expire.
uint64_t GetExpirationTime() const { return expiration_time_; }
// returns true if this transaction has an expiration_time and has expired.
bool IsExpired() const;
// Returns the number of milliseconds a transaction can wait on acquiring a
// lock or -1 if there is no timeout.
int64_t GetLockTimeout() const { return lock_timeout_; }
void SetLockTimeout(int64_t timeout) { lock_timeout_ = timeout; }
private:
TransactionDB* const db_;
TransactionDBImpl* txn_db_impl_;
// Used to create unique ids for transactions.
static std::atomic<TransactionID> txn_id_counter_;
// Unique ID for this transaction
const TransactionID txn_id_;
const WriteOptions write_options_;
// If snapshot_ is set, all keys that locked must also have not been written
// since this snapshot
const Snapshot* snapshot_;
const Comparator* cmp_;
std::unique_ptr<WriteBatchWithIndex> write_batch_;
// If expiration_ is non-zero, start_time_ stores that time the txn was
// constructed,
// in milliseconds.
const uint64_t start_time_;
// If non-zero, this transaction should not be committed after this time (in
// milliseconds)
const uint64_t expiration_time_;
// Timeout in microseconds when locking a key or -1 if there is no timeout.
int64_t lock_timeout_;
// Map from column_family_id to map of keys to Sequence Numbers. Stores keys
// that have been locked.
// The key is known to not have been modified after the Sequence Number
// stored.
TransactionKeyMap tracked_keys_;
// Records the number of entries currently in the WriteBatch, including calls
// to PutLogData()
size_t num_entries_ = 0;
// Stack of number of entries in write_batch at each save point
std::unique_ptr<std::stack<size_t>> save_points_;
Status TryLock(ColumnFamilyHandle* column_family, const Slice& key,
bool check_snapshot = true);
Status TryLock(ColumnFamilyHandle* column_family, const SliceParts& key,
bool check_snapshot = true);
void Cleanup();
Status CheckKeySequence(ColumnFamilyHandle* column_family, const Slice& key);
Status LockBatch(WriteBatch* batch, TransactionKeyMap* keys_to_unlock);
Status DoCommit(WriteBatch* batch);
void RollbackLastN(size_t num);
// No copying allowed
TransactionImpl(const TransactionImpl&);
void operator=(const TransactionImpl&);
};
// Used at commit time to reject a write when the committing transaction has
// already passed its expiration time.
class TransactionCallback : public WriteCallback {
 public:
  explicit TransactionCallback(TransactionImpl* txn) : txn_(txn) {}

  // Returns TimedOut when the transaction reports itself as expired and OK
  // otherwise. The db argument is not consulted.
  Status Callback(DB* db) override {
    return txn_->IsExpired() ? Status::TimedOut() : Status::OK();
  }

 private:
  // Transaction whose expiration is checked; never deleted by this class.
  TransactionImpl* txn_;
};
} // namespace rocksdb
#endif // ROCKSDB_LITE

@ -0,0 +1,443 @@
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef ROCKSDB_LITE
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include "utilities/transactions/transaction_lock_mgr.h"
#include <inttypes.h>
#include <algorithm>
#include <condition_variable>
#include <functional>
#include <mutex>
#include <string>
#include <vector>
#include "rocksdb/slice.h"
#include "util/autovector.h"
#include "util/murmurhash.h"
#include "util/thread_local.h"
namespace rocksdb {
struct LockInfo {
TransactionID txn_id;
uint64_t
expiration_time; // Transaction locks are not valid after this time in ms
LockInfo(TransactionID id, uint64_t time)
: txn_id(id), expiration_time(time) {}
LockInfo(const LockInfo& lock_info)
: txn_id(lock_info.txn_id), expiration_time(lock_info.expiration_time) {}
};
// One stripe of a column family's lock table: the locked keys that hash to
// this stripe plus the mutex and condition variable that guard them.
struct LockMapStripe {
// Mutex must be held before modifying keys map
// (a timed mutex, because lock acquisition may give up after a timeout).
std::timed_mutex stripe_mutex;
// Condition Variable per stripe for waiting on a lock
// (waiters block on it together with stripe_mutex and are woken by UnLock()).
std::condition_variable_any stripe_cv;
// Locked keys mapped to the info about the transactions that locked them.
// TODO(agiardullo): Explore performance of other data structures.
std::unordered_map<std::string, LockInfo> keys;
};
// Lock table of one column family, split into num_stripes_ LockMapStripes.
struct LockMap {
  explicit LockMap(size_t stripe_count)
      : num_stripes_(stripe_count), lock_map_stripes_(stripe_count) {}

  // Copying yields a fresh, empty table with the same number of stripes; the
  // locked keys and the lock count of the source are not carried over.
  LockMap(const LockMap& other)
      : num_stripes_(other.num_stripes_),
        lock_map_stripes_(other.num_stripes_) {}

  // Number of separate LockMapStripes to create, each with their own Mutex
  const size_t num_stripes_;

  // Count of keys that are currently locked in this column family.
  // (Only maintained if TransactionLockMgr::max_num_locks_ is positive.)
  std::atomic<int64_t> lock_cnt{0};

  std::vector<LockMapStripe> lock_map_stripes_;

  // Returns the index of the stripe that key hashes to.
  size_t GetStripe(const std::string& key) const;
};
namespace {
void UnrefLockMapsCache(void* ptr) {
// Called when a thread exits or a ThreadLocalPtr gets destroyed.
auto lock_maps_cache =
static_cast<std::unordered_map<uint32_t, std::shared_ptr<LockMap>>*>(ptr);
delete lock_maps_cache;
}
} // anonymous namespace
// Creates a lock manager with no column families registered.
//   default_num_stripes: number of stripes given to each LockMap created by
//                        AddColumnFamily().
//   max_num_locks:       per-column-family limit on locked keys; the limit is
//                        only enforced when this value is positive.
// The thread-local cache is created with UnrefLockMapsCache as the function
// that frees each thread's cached map.
TransactionLockMgr::TransactionLockMgr(size_t default_num_stripes,
int64_t max_num_locks)
: default_num_stripes_(default_num_stripes),
max_num_locks_(max_num_locks),
lock_maps_cache_(new ThreadLocalPtr(&UnrefLockMapsCache)) {}
// Nothing is released explicitly here; lock_maps_ and lock_maps_cache_ are
// cleaned up by their own destructors.
TransactionLockMgr::~TransactionLockMgr() {}
// Maps key to the index of the stripe responsible for it: the murmur hash of
// the key modulo the number of stripes.
size_t LockMap::GetStripe(const std::string& key) const {
  assert(num_stripes_ > 0);
  static murmur_hash hash;
  return hash(key) % num_stripes_;
}
// Registers an empty LockMap (with default_num_stripes_ stripes) for
// column_family_id. Registering an id twice is a caller error: it trips an
// assert in debug builds and is otherwise ignored.
void TransactionLockMgr::AddColumnFamily(uint32_t column_family_id) {
  InstrumentedMutexLock l(&lock_map_mutex_);

  const bool already_registered = lock_maps_.count(column_family_id) != 0;
  if (already_registered) {
    // column_family already exists in lock map
    assert(false);
    return;
  }

  lock_maps_.emplace(
      column_family_id,
      std::shared_ptr<LockMap>(new LockMap(default_num_stripes_)));
}
void TransactionLockMgr::RemoveColumnFamily(uint32_t column_family_id) {
// Remove lock_map for this column family. Since the lock map is stored
// as a shared ptr, concurrent transactions can still keep keep using it
// until they release their reference to it.
{
InstrumentedMutexLock l(&lock_map_mutex_);
auto lock_maps_iter = lock_maps_.find(column_family_id);
assert(lock_maps_iter != lock_maps_.end());
lock_maps_.erase(lock_maps_iter);
} // lock_map_mutex_
// Clear all thread-local caches
autovector<void*> local_caches;
lock_maps_cache_->Scrape(&local_caches, nullptr);
for (auto cache : local_caches) {
delete static_cast<LockMaps*>(cache);
}
}
// Look up the LockMap shared_ptr for a given column_family_id.
// Note: The LockMap is only valid as long as the caller is still holding on
// to the returned shared_ptr.
std::shared_ptr<LockMap> TransactionLockMgr::GetLockMap(
uint32_t column_family_id) {
// First check thread-local cache
if (lock_maps_cache_->Get() == nullptr) {
lock_maps_cache_->Reset(new LockMaps());
}
auto lock_maps_cache = static_cast<LockMaps*>(lock_maps_cache_->Get());
auto lock_map_iter = lock_maps_cache->find(column_family_id);
if (lock_map_iter != lock_maps_cache->end()) {
// Found lock map for this column family.
return lock_map_iter->second;
}
// Not found in local cache, grab mutex and check shared LockMaps
InstrumentedMutexLock l(&lock_map_mutex_);
lock_map_iter = lock_maps_.find(column_family_id);
if (lock_map_iter == lock_maps_.end()) {
return std::shared_ptr<LockMap>(nullptr);
} else {
// Found lock map. Store in thread-local cache and return.
std::shared_ptr<LockMap>& lock_map = lock_map_iter->second;
lock_maps_cache->insert({column_family_id, lock_map});
return lock_map;
}
}
// Returns true if this lock has expired and can be acquired by another
// transaction.
//
// If false is returned and wait_time_us is non-null, *wait_time_us is set to
// the number of microseconds until the lock expires, or to 0 if the lock has
// no expiration. If true is returned, *wait_time_us is left untouched.
//
// lock_info.expiration_time is in milliseconds and is compared against
// env->NowMicros().
bool TransactionLockMgr::IsLockExpired(const LockInfo& lock_info, Env* env,
                                       uint64_t* wait_time_us) {
  if (lock_info.expiration_time == 0) {
    // The lock never expires. Report 0 explicitly so that a caller reusing
    // the same variable across retries does not keep a stale wait time from
    // a previous holder of the key.
    if (wait_time_us != nullptr) {
      *wait_time_us = 0;
    }
    return false;
  }

  const auto now = env->NowMicros();
  const uint64_t expiration_us = lock_info.expiration_time * 1000;
  if (expiration_us <= now) {
    return true;
  }

  if (wait_time_us != nullptr) {
    // return how many microseconds until lock will be expired
    *wait_time_us = expiration_us - now;
  }
  return false;
}
// Attempts to lock key in the given column family on behalf of txn, waiting
// according to the transaction's lock timeout. Returns InvalidArgument when
// the column family has no lock map; otherwise returns whatever
// AcquireWithTimeout() reports.
Status TransactionLockMgr::TryLock(const TransactionImpl* txn,
                                   uint32_t column_family_id,
                                   const std::string& key, Env* env) {
  // The shared_ptr keeps the lock map alive for the duration of this call.
  const std::shared_ptr<LockMap> map_ref = GetLockMap(column_family_id);
  LockMap* const map = map_ref.get();
  if (map == nullptr) {
    char msg[255];
    snprintf(msg, sizeof(msg), "Column family id not found: %" PRIu32,
             column_family_id);
    return Status::InvalidArgument(msg);
  }

  // The key is guarded by the mutex of the stripe it hashes to.
  const size_t stripe_idx = map->GetStripe(key);
  assert(stripe_idx < map->lock_map_stripes_.size());
  LockMapStripe* const stripe = &map->lock_map_stripes_.at(stripe_idx);

  const LockInfo request(txn->GetTxnID(), txn->GetExpirationTime());
  return AcquireWithTimeout(map, stripe, key, env, txn->GetLockTimeout(),
                            request);
}
// Helper function for TryLock().
//
// Takes the stripe mutex and then tries to lock `key` for the transaction
// described by lock_info, retrying while the key is held by someone else.
//
// timeout is in milliseconds:
//   == 0  never block: neither on the stripe mutex nor on the key.
//   <  0  block without a deadline.
//   >  0  give up once `timeout` milliseconds have elapsed.
//
// Returns Busy if the stripe mutex could not be taken in time; otherwise
// returns the status of the last AcquireLocked() attempt. The stripe mutex is
// released before returning.
//
// NOTE(review): deadlines are computed with std::chrono::system_clock, so
// wall-clock adjustments can shift them -- consider a monotonic clock.
Status TransactionLockMgr::AcquireWithTimeout(LockMap* lock_map,
LockMapStripe* stripe,
const std::string& key, Env* env,
int64_t timeout,
const LockInfo& lock_info) {
// Absolute deadline; only meaningful (and only read) when timeout > 0.
std::chrono::system_clock::time_point end_time;
if (timeout > 0) {
end_time =
std::chrono::system_clock::now() + std::chrono::milliseconds(timeout);
}
bool locked = true;
if (timeout == 0) {
// If timeout is 0, we do not wait to acquire the lock if it is not
// available
locked = stripe->stripe_mutex.try_lock();
} else if (timeout < 0) {
// If timeout is negative, we wait indefinitely to acquire the lock
stripe->stripe_mutex.lock();
} else {
// If timeout is positive, we attempt to acquire the lock unless we timeout
locked = stripe->stripe_mutex.try_lock_until(end_time);
}
if (!locked) {
// timeout acquiring mutex
return Status::Busy();
}
// Acquire lock if we are able to
// wait_time_us may be filled in by AcquireLocked() with the time until the
// current holder's lock expires.
uint64_t wait_time_us = 0;
Status result =
AcquireLocked(lock_map, stripe, key, env, lock_info, &wait_time_us);
if (result.IsBusy() && timeout != 0) {
// If we weren't able to acquire the lock, we will keep retrying as long
// as the timeout allows.
bool timed_out = false;
do {
// Check to see if the lock expires sooner than our timeout.
// If it does, wait_time_end is the moment of that expiration; otherwise
// wait_time_us is reset to 0 and the branches below fall back to waiting
// on our own deadline (or indefinitely).
std::chrono::system_clock::time_point wait_time_end;
if (wait_time_us > 0 &&
(timeout < 0 ||
wait_time_us < static_cast<uint64_t>(timeout * 1000))) {
wait_time_end = std::chrono::system_clock::now() +
std::chrono::microseconds(wait_time_us);
if (timeout > 0 && wait_time_end >= end_time) {
// lock expiration time is after our timeout.
wait_time_us = 0;
}
} else {
wait_time_us = 0;
}
if (wait_time_us > 0) {
// Wait up to the lock's current expiration time
stripe->stripe_cv.wait_until(stripe->stripe_mutex, wait_time_end);
} else if (timeout > 0) {
// Wait until we timeout
auto cv_status =
stripe->stripe_cv.wait_until(stripe->stripe_mutex, end_time);
if (cv_status == std::cv_status::timeout) {
timed_out = true;
// Even though we timed out, we will still make one more attempt to
// acquire lock below (it is possible the lock expired and we
// were never signaled).
}
} else {
// No wait timeout.
stripe->stripe_cv.wait(stripe->stripe_mutex);
}
// The stripe mutex is held again here, so the key can be re-examined.
result =
AcquireLocked(lock_map, stripe, key, env, lock_info, &wait_time_us);
} while (result.IsBusy() && !timed_out);
}
stripe->stripe_mutex.unlock();
return result;
}
// Try to lock this key after we have acquired the mutex.
//
// Outcomes:
//   - key not locked: it is recorded as locked by txn_lock_info, unless the
//     per-column-family lock limit has been reached (returns Busy).
//   - key locked by the same transaction: nothing changes, returns OK.
//   - key locked by another transaction whose lock has expired: the lock is
//     taken over, returns OK.
//   - key locked by another transaction whose lock is still valid: returns
//     Busy; wait_time_us is passed to IsLockExpired(), which reports how long
//     until that lock expires.
//
// REQUIRED: Stripe mutex must be held.
Status TransactionLockMgr::AcquireLocked(LockMap* lock_map,
                                         LockMapStripe* stripe,
                                         const std::string& key, Env* env,
                                         const LockInfo& txn_lock_info,
                                         uint64_t* wait_time_us) {
  Status result;

  // Single lookup; the iterator is reused to update the entry in place.
  const auto held = stripe->keys.find(key);
  if (held != stripe->keys.end()) {
    // Lock already held
    LockInfo& lock_info = held->second;
    if (lock_info.txn_id != txn_lock_info.txn_id) {
      // locked by another txn.  Check if it's expired
      if (IsLockExpired(lock_info, env, wait_time_us)) {
        // lock is expired, can steal it
        lock_info.txn_id = txn_lock_info.txn_id;
        lock_info.expiration_time = txn_lock_info.expiration_time;
        // lock_cnt does not change
      } else {
        result = Status::Busy();
      }
    }
  } else {  // Lock not held.
    // Check lock limit
    if (max_num_locks_ > 0 &&
        lock_map->lock_cnt.load(std::memory_order_acquire) >= max_num_locks_) {
      result =
          Status::Busy("Failed to acquire lock due to max_num_locks limit");
    } else {
      // acquire lock
      stripe->keys.insert({key, txn_lock_info});

      // Maintain lock count if there is a limit on the number of locks.
      // The test must be "> 0", matching the limit check above and the
      // decrement in UnLock(); a negative limit would otherwise increment
      // the counter without ever decrementing it.
      if (max_num_locks_ > 0) {
        lock_map->lock_cnt++;
      }
    }
  }

  return result;
}
// Releases the lock on a single key if it is still held by txn, then wakes
// every thread waiting on the key's stripe. Does nothing when the column
// family has no lock map.
void TransactionLockMgr::UnLock(TransactionImpl* txn, uint32_t column_family_id,
                                const std::string& key, Env* env) {
  const std::shared_ptr<LockMap> map_ref = GetLockMap(column_family_id);
  LockMap* const map = map_ref.get();
  if (map == nullptr) {
    // Column Family must have been dropped.
    return;
  }

  // Locate the stripe that this key hashes to.
  const size_t stripe_idx = map->GetStripe(key);
  assert(stripe_idx < map->lock_map_stripes_.size());
  LockMapStripe* const stripe = &map->lock_map_stripes_.at(stripe_idx);

  const TransactionID owner = txn->GetTxnID();
  {
    std::lock_guard<std::timed_mutex> guard(stripe->stripe_mutex);

    const auto entry = stripe->keys.find(key);
    const bool held_by_txn =
        entry != stripe->keys.end() && entry->second.txn_id == owner;
    if (!held_by_txn) {
      // This key is either not locked or locked by someone else.  This should
      // only happen if the unlocking transaction has expired.
      assert(txn->GetExpirationTime() > 0 &&
             txn->GetExpirationTime() * 1000 < env->NowMicros());
    } else {
      // Found the key we locked.  unlock it.
      stripe->keys.erase(entry);
      if (max_num_locks_ > 0) {
        // Maintain lock count if there is a limit on the number of locks.
        assert(map->lock_cnt.load(std::memory_order_relaxed) > 0);
        map->lock_cnt--;
      }
    }
  }  // stripe_mutex unlocked

  // Signal waiting threads to retry locking
  stripe->stripe_cv.notify_all();
}
// Releases every key in key_map that is still locked by txn.
//
// key_map maps a column family id to the keys locked in that column family.
// Keys are grouped by stripe so that each stripe mutex is taken only once per
// column family; waiters on a stripe are notified after its keys have been
// released.
void TransactionLockMgr::UnLock(const TransactionImpl* txn,
                                const TransactionKeyMap* key_map, Env* env) {
  TransactionID txn_id = txn->GetTxnID();

  for (auto& key_map_iter : *key_map) {
    uint32_t column_family_id = key_map_iter.first;
    auto& keys = key_map_iter.second;

    std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id);
    LockMap* lock_map = lock_map_ptr.get();

    if (lock_map == nullptr) {
      // Column Family must have been dropped.  Skip it, but keep going: the
      // remaining column families in key_map still hold locks that have to
      // be released.
      continue;
    }

    // Bucket keys by lock_map_ stripe
    std::unordered_map<size_t, std::vector<const std::string*>> keys_by_stripe(
        std::max(keys.size(), lock_map->num_stripes_));

    for (auto& key_iter : keys) {
      const std::string& key = key_iter.first;

      size_t stripe_num = lock_map->GetStripe(key);
      keys_by_stripe[stripe_num].push_back(&key);
    }

    // For each stripe, grab the stripe mutex and unlock all keys in this stripe
    for (auto& stripe_iter : keys_by_stripe) {
      size_t stripe_num = stripe_iter.first;
      auto& stripe_keys = stripe_iter.second;

      assert(lock_map->lock_map_stripes_.size() > stripe_num);
      LockMapStripe* stripe = &lock_map->lock_map_stripes_.at(stripe_num);

      {
        std::lock_guard<std::timed_mutex> lock(stripe->stripe_mutex);

        for (const std::string* key : stripe_keys) {
          const auto& iter = stripe->keys.find(*key);
          if (iter != stripe->keys.end() && iter->second.txn_id == txn_id) {
            // Found the key we locked.  unlock it.
            stripe->keys.erase(iter);
            if (max_num_locks_ > 0) {
              // Maintain lock count if there is a limit on the number of locks.
              assert(lock_map->lock_cnt.load(std::memory_order_relaxed) > 0);
              lock_map->lock_cnt--;
            }
          } else {
            // This key is either not locked or locked by someone else.  This
            // should only happen if the unlocking transaction has expired.
            assert(txn->GetExpirationTime() > 0 &&
                   txn->GetExpirationTime() * 1000 < env->NowMicros());
          }
        }
      }  // stripe_mutex unlocked

      // Signal waiting threads to retry locking
      stripe->stripe_cv.notify_all();
    }
  }
}
} // namespace rocksdb
#endif // ROCKSDB_LITE

@ -0,0 +1,90 @@
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#ifndef ROCKSDB_LITE
#include <chrono>
#include <string>
#include <unordered_map>
#include <vector>
#include "rocksdb/utilities/transaction.h"
#include "util/instrumented_mutex.h"
#include "util/thread_local.h"
#include "utilities/transactions/transaction_impl.h"
namespace rocksdb {
class ColumnFamilyHandle;
struct LockInfo;
struct LockMap;
struct LockMapStripe;
class Slice;
// Tracks which keys are locked by which transaction, per column family.
// Each column family has its own LockMap, which is divided into stripes; a
// key is guarded by the mutex of the stripe it hashes to.
class TransactionLockMgr {
public:
// default_num_stripes: number of stripes for each newly added column family.
// max_num_locks: limit on locked keys per column family; only enforced when
// positive.
TransactionLockMgr(size_t default_num_stripes, int64_t max_num_locks);
~TransactionLockMgr();
// Creates a new LockMap for this column family. Caller should guarantee
// that this column family does not already exist.
void AddColumnFamily(uint32_t column_family_id);
// Deletes the LockMap for this column family. Caller should guarantee that
// this column family is no longer in use.
void RemoveColumnFamily(uint32_t column_family_id);
// Attempt to lock key. If OK status is returned, the caller is responsible
// for calling UnLock() on this key.
Status TryLock(const TransactionImpl* txn, uint32_t column_family_id,
const std::string& key, Env* env);
// Unlock a key locked by TryLock(). txn must be the same Transaction that
// locked this key.
// The first overload releases every key listed in `keys`; the second
// releases a single key of one column family.
void UnLock(const TransactionImpl* txn, const TransactionKeyMap* keys,
Env* env);
void UnLock(TransactionImpl* txn, uint32_t column_family_id,
const std::string& key, Env* env);
private:
// Default number of lock map stripes per column family
const size_t default_num_stripes_;
// Limit on number of keys locked per column family
const int64_t max_num_locks_;
// Must be held when accessing/modifying lock_maps_
InstrumentedMutex lock_map_mutex_;
// Map of ColumnFamilyId to locked key info
using LockMaps = std::unordered_map<uint32_t, std::shared_ptr<LockMap>>;
LockMaps lock_maps_;
// Thread-local cache of entries in lock_maps_. This is an optimization
// to avoid acquiring a mutex in order to look up a LockMap
std::unique_ptr<ThreadLocalPtr> lock_maps_cache_;
// Returns true if lock_info has an expiration time that has already passed.
// Otherwise, if the lock has an expiration time, stores the number of
// microseconds until it expires in *wait_time.
bool IsLockExpired(const LockInfo& lock_info, Env* env, uint64_t* wait_time);
// Returns the LockMap of the column family, or a null pointer if the column
// family is not registered.
std::shared_ptr<LockMap> GetLockMap(uint32_t column_family_id);
// Takes the stripe mutex and tries to lock key, waiting as allowed by
// timeout (milliseconds; 0 = do not wait, negative = no deadline).
Status AcquireWithTimeout(LockMap* lock_map, LockMapStripe* stripe,
const std::string& key, Env* env, int64_t timeout,
const LockInfo& lock_info);
// Tries to lock key once. The stripe mutex must already be held.
Status AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
const std::string& key, Env* env,
const LockInfo& lock_info, uint64_t* wait_time);
// No copying allowed
TransactionLockMgr(const TransactionLockMgr&);
void operator=(const TransactionLockMgr&);
};
} // namespace rocksdb
#endif // ROCKSDB_LITE

File diff suppressed because it is too large Load Diff

@ -0,0 +1,265 @@
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef ROCKSDB_LITE
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include "utilities/transactions/transaction_util.h"
#include <inttypes.h>
#include <string>
#include <vector>
#include "db/db_impl.h"
#include "rocksdb/status.h"
#include "rocksdb/utilities/write_batch_with_index.h"
#include "util/string_util.h"
namespace rocksdb {
// Checks a single key of the given column family for writes newer than
// key_seq. Takes a reference to the column family's SuperVersion for the
// duration of the check and returns Busy if that reference cannot be
// obtained.
Status TransactionUtil::CheckKeyForConflicts(DBImpl* db_impl,
                                             ColumnFamilyHandle* column_family,
                                             const std::string& key,
                                             SequenceNumber key_seq) {
  auto* handle_impl = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
  auto cfd = handle_impl->cfd();

  SuperVersion* const sv = db_impl->GetAndRefSuperVersion(cfd);
  if (sv == nullptr) {
    return Status::Busy("Could not access column family " +
                        handle_impl->GetName());
  }

  const SequenceNumber earliest_seq =
      db_impl->GetEarliestMemTableSequenceNumber(sv, true);
  Status outcome = CheckKey(db_impl, sv, earliest_seq, key_seq, key);

  // Give the SuperVersion back regardless of the outcome of the check.
  db_impl->ReturnAndCleanupSuperVersion(cfd, sv);
  return outcome;
}
// Checks whether `key` has been written after sequence number key_seq, using
// only the memtables referenced by sv.
//
//   earliest_seq: earliest sequence number covered by those memtables, or
//                 kMaxSequenceNumber if it is unknown.
//
// Returns OK if no newer write is found, Busy if a newer write exists or if
// the memtable history does not reach back to key_seq, and any error reported
// by the memtable lookup.
Status TransactionUtil::CheckKey(DBImpl* db_impl, SuperVersion* sv,
                                 SequenceNumber earliest_seq,
                                 SequenceNumber key_seq,
                                 const std::string& key) {
  Status result;

  // Since it would be too slow to check the SST files, we will only use
  // the memtables to check whether there have been any recent writes
  // to this key after it was accessed in this transaction.  But if the
  // Memtables do not contain a long enough history, we must fail the
  // transaction.
  if (earliest_seq == kMaxSequenceNumber) {
    // The age of this memtable is unknown.  Cannot rely on it to check
    // for recent writes.  This error shouldn't happen often in practice as
    // the Memtable should have a valid earliest sequence number except in
    // some corner cases (such as error cases during recovery).
    result = Status::Busy(
        "Transaction could not check for conflicts as the MemTable does not "
        "contain a long enough history to check write at SequenceNumber: ",
        ToString(key_seq));
  } else if (key_seq < earliest_seq) {
    // The age of this memtable is too new to use to check for recent
    // writes.
    //
    // The message is built as a std::string: its fixed text alone is about
    // as long as the 255-byte buffer used previously, so formatting it with
    // snprintf always cut off the end of the message.
    result = Status::Busy(
        "Transaction could not check for conflicts for operation at "
        "SequenceNumber " +
        ToString(key_seq) +
        " as the MemTable only contains changes newer than SequenceNumber " +
        ToString(earliest_seq) +
        ". Increasing the value of the "
        "max_write_buffer_number_to_maintain option could reduce the "
        "frequency "
        "of this error.");
  } else {
    // kMaxSequenceNumber is the "no entry found" marker for the lookup below.
    SequenceNumber seq = kMaxSequenceNumber;
    Status s = db_impl->GetLatestSequenceForKeyFromMemtable(sv, key, &seq);
    if (!s.ok()) {
      result = s;
    } else if (seq != kMaxSequenceNumber && seq > key_seq) {
      // The key was written after key_seq: conflict.
      result = Status::Busy();
    }
  }

  return result;
}
// Runs CheckKey() for every (key, sequence number) pair in key_map, one
// column family at a time, and stops at the first non-OK result. Returns Busy
// if the SuperVersion of a column family cannot be obtained.
Status TransactionUtil::CheckKeysForConflicts(DBImpl* db_impl,
                                              TransactionKeyMap* key_map) {
  for (auto& cf_entry : *key_map) {
    const uint32_t cf_id = cf_entry.first;

    SuperVersion* sv = db_impl->GetAndRefSuperVersion(cf_id);
    if (sv == nullptr) {
      return Status::Busy("Could not access column family " + ToString(cf_id));
    }

    const SequenceNumber earliest_seq =
        db_impl->GetEarliestMemTableSequenceNumber(sv, true);

    // For each of the keys in this transaction, check to see if someone has
    // written to this key since the start of the transaction.
    Status cf_status;
    for (const auto& key_entry : cf_entry.second) {
      cf_status = CheckKey(db_impl, sv, earliest_seq, key_entry.second,
                           key_entry.first);
      if (!cf_status.ok()) {
        break;
      }
    }

    // The SuperVersion is returned before any failure is propagated.
    db_impl->ReturnAndCleanupSuperVersion(cf_id, sv);

    if (!cf_status.ok()) {
      return cf_status;
    }
  }

  return Status::OK();
}
// Replays `batch` and re-applies its first `num` entries (Put, Merge, Delete
// and PutLogData) to new_batch. Returns a non-OK status if a column family
// referenced by one of those entries cannot be resolved, or if iterating the
// batch fails for another reason.
Status TransactionUtil::CopyFirstN(size_t num, WriteBatchWithIndex* batch,
                                   WriteBatchWithIndex* new_batch,
                                   DBImpl* db_impl) {
  // Handler for iterating through batch and copying entries to new_batch
  class Handler : public WriteBatch::Handler {
   public:
    WriteBatchWithIndex* batch;  // destination of the copied entries
    const size_t limit;          // number of entries to copy
    DBImpl* db_impl;
    size_t seen = 0;  // entries visited so far
    // SuperVersions referenced while resolving handles; returned in ~Handler.
    std::unordered_map<uint32_t, SuperVersion*> super_versions;
    // Column family handles already resolved, keyed by column family id.
    std::unordered_map<uint32_t, ColumnFamilyHandle*> handles;

    Handler(WriteBatchWithIndex* dest, size_t new_limit, DBImpl* db)
        : batch(dest), limit(new_limit), db_impl(db) {}

    ~Handler() {
      for (auto& held : super_versions) {
        db_impl->ReturnAndCleanupSuperVersionUnlocked(held.first, held.second);
      }
    }

    // Resolves column_family_id to its handle and stores it in *cfh.
    Status GetColumnFamily(uint32_t column_family_id,
                           ColumnFamilyHandle** cfh) {
      // Need to look up ColumnFamilyHandle for this column family id.  Since
      // doing this requires grabbing a mutex, lets only do it once per column
      // family and cache it.
      // In order to ensure that the ColumnFamilyHandle is still valid, we need
      // to hold the superversion.
      const auto cached = handles.find(column_family_id);
      if (cached != handles.end()) {
        *cfh = cached->second;
        return Status::OK();
      }

      // Don't have ColumnFamilyHandle cached, look it up from the db.
      SuperVersion* sv =
          db_impl->GetAndRefSuperVersionUnlocked(column_family_id);
      if (sv == nullptr) {
        return Status::InvalidArgument(
            "Could not find column family for ID " +
            ToString(column_family_id));
      }
      super_versions.insert({column_family_id, sv});

      *cfh = db_impl->GetColumnFamilyHandleUnlocked(column_family_id);
      if (*cfh == nullptr) {
        return Status::InvalidArgument(
            "Could not find column family handle for ID " +
            ToString(column_family_id));
      }
      handles.insert({column_family_id, *cfh});

      return Status::OK();
    }

    virtual Status PutCF(uint32_t column_family_id, const Slice& key,
                         const Slice& value) override {
      if (seen >= limit) {
        // Found the first N entries, return Aborted to stop the Iteration.
        return Status::Aborted();
      }
      ColumnFamilyHandle* handle = nullptr;
      const Status lookup = GetColumnFamily(column_family_id, &handle);
      ++seen;
      if (lookup.ok()) {
        batch->Put(handle, key, value);
      }
      return lookup;
    }

    virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
                           const Slice& value) override {
      if (seen >= limit) {
        // Found the first N entries, return Aborted to stop the Iteration.
        return Status::Aborted();
      }
      ColumnFamilyHandle* handle = nullptr;
      const Status lookup = GetColumnFamily(column_family_id, &handle);
      ++seen;
      if (lookup.ok()) {
        batch->Merge(handle, key, value);
      }
      return lookup;
    }

    virtual Status DeleteCF(uint32_t column_family_id,
                            const Slice& key) override {
      if (seen >= limit) {
        // Found the first N entries, return Aborted to stop the Iteration.
        return Status::Aborted();
      }
      ColumnFamilyHandle* handle = nullptr;
      const Status lookup = GetColumnFamily(column_family_id, &handle);
      ++seen;
      if (lookup.ok()) {
        batch->Delete(handle, key);
      }
      return lookup;
    }

    virtual void LogData(const Slice& blob) override {
      // Log entries count towards the limit but cannot stop the iteration.
      const bool within_limit = seen < limit;
      ++seen;
      if (within_limit) {
        batch->PutLogData(blob);
      }
    }
  };

  // Iterating on this handler will add all keys in this batch into a new batch
  // up to the limit.
  Handler handler(new_batch, num, db_impl);
  const Status iterate_status = batch->GetWriteBatch()->Iterate(&handler);

  // Handler returns Aborted when it is done copying to stop the iteration.
  return iterate_status.IsAborted() ? Status::OK() : iterate_status;
}
} // namespace rocksdb
#endif // ROCKSDB_LITE

@ -0,0 +1,65 @@
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#ifndef ROCKSDB_LITE
#include <string>
#include <unordered_map>
#include "rocksdb/db.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "rocksdb/types.h"
namespace rocksdb {
// Keys tracked by a transaction: maps a column family id to a map from key to
// the sequence number associated with that key.
using TransactionKeyMap =
std::unordered_map<uint32_t,
std::unordered_map<std::string, SequenceNumber>>;
class DBImpl;
struct SuperVersion;
class WriteBatchWithIndex;
// Static helpers shared by the transaction implementations: conflict checks
// against the memtables and partial copying of a write batch.
class TransactionUtil {
public:
// Verifies there have been no writes to this key in the db since this
// sequence number.
//
// Returns OK on success, BUSY if there is a conflicting write, or other error
// status for any unexpected errors.
static Status CheckKeyForConflicts(DBImpl* db_impl,
ColumnFamilyHandle* column_family,
const std::string& key,
SequenceNumber key_seq);
// For each key,SequenceNumber pair in the TransactionKeyMap, this function
// will verify there have been no writes to the key in the db since that
// sequence number.
//
// Returns OK on success, BUSY if there is a conflicting write, or other error
// status for any unexpected errors.
//
// REQUIRED: this function should only be called on the write thread or if the
// mutex is held.
static Status CheckKeysForConflicts(DBImpl* db_impl, TransactionKeyMap* keys);
// Copies the first num entries from batch into new_batch (including Put,
// Merge, Delete, and PutLogData).
// Returns non-OK on error.
static Status CopyFirstN(size_t num, WriteBatchWithIndex* batch,
WriteBatchWithIndex* new_batch, DBImpl* db_impl);
private:
// Shared implementation of the two conflict checks above: checks one key
// against the memtables referenced by sv. earliest_seq is the earliest
// sequence number covered by those memtables.
static Status CheckKey(DBImpl* db_impl, SuperVersion* sv,
SequenceNumber earliest_seq, SequenceNumber key_seq,
const std::string& key);
};
} // namespace rocksdb
#endif // ROCKSDB_LITE

@ -626,12 +626,15 @@ Status WriteBatchWithIndex::GetFromBatch(ColumnFamilyHandle* column_family,
switch (result) { switch (result) {
case WriteBatchWithIndexInternal::Result::kFound: case WriteBatchWithIndexInternal::Result::kFound:
case WriteBatchWithIndexInternal::Result::kError: case WriteBatchWithIndexInternal::Result::kError:
return s; // use returned status
break;
case WriteBatchWithIndexInternal::Result::kDeleted: case WriteBatchWithIndexInternal::Result::kDeleted:
case WriteBatchWithIndexInternal::Result::kNotFound: case WriteBatchWithIndexInternal::Result::kNotFound:
return Status::NotFound(); s = Status::NotFound();
break;
case WriteBatchWithIndexInternal::Result::kMergeInProgress: case WriteBatchWithIndexInternal::Result::kMergeInProgress:
return Status::MergeInProgress(""); s = Status::MergeInProgress("");
break;
default: default:
assert(false); assert(false);
} }
@ -659,8 +662,8 @@ Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db,
std::string batch_value; std::string batch_value;
WriteBatchWithIndexInternal::Result result = WriteBatchWithIndexInternal::Result result =
WriteBatchWithIndexInternal::GetFromBatch( WriteBatchWithIndexInternal::GetFromBatch(
options, this, column_family, key, &merge_context, &rep->comparator, options, this, column_family, key, &merge_context,
&batch_value, &s); &rep->comparator, &batch_value, &s);
if (result == WriteBatchWithIndexInternal::Result::kFound) { if (result == WriteBatchWithIndexInternal::Result::kFound) {
value->assign(batch_value.data(), batch_value.size()); value->assign(batch_value.data(), batch_value.size());

Loading…
Cancel
Save