Summary: Initial implementation of Pessimistic Transactions. This diff contains the API changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
main
parent
c2868cbc52
commit
c2f2cb0214
@ -0,0 +1,142 @@ |
||||
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#ifndef ROCKSDB_LITE |
||||
|
||||
#include "rocksdb/db.h" |
||||
#include "rocksdb/options.h" |
||||
#include "rocksdb/slice.h" |
||||
#include "rocksdb/utilities/transaction.h" |
||||
#include "rocksdb/utilities/optimistic_transaction_db.h" |
||||
|
||||
using namespace rocksdb; |
||||
|
||||
std::string kDBPath = "/tmp/rocksdb_transaction_example"; |
||||
|
||||
int main() { |
||||
// open DB
|
||||
Options options; |
||||
options.create_if_missing = true; |
||||
DB* db; |
||||
OptimisticTransactionDB* txn_db; |
||||
|
||||
Status s = OptimisticTransactionDB::Open(options, kDBPath, &txn_db); |
||||
assert(s.ok()); |
||||
db = txn_db->GetBaseDB(); |
||||
|
||||
WriteOptions write_options; |
||||
ReadOptions read_options; |
||||
OptimisticTransactionOptions txn_options; |
||||
std::string value; |
||||
|
||||
////////////////////////////////////////////////////////
|
||||
//
|
||||
// Simple OptimisticTransaction Example ("Read Committed")
|
||||
//
|
||||
////////////////////////////////////////////////////////
|
||||
|
||||
// Start a transaction
|
||||
Transaction* txn = txn_db->BeginTransaction(write_options); |
||||
assert(txn); |
||||
|
||||
// Read a key in this transaction
|
||||
s = txn->Get(read_options, "abc", &value); |
||||
assert(s.IsNotFound()); |
||||
|
||||
// Write a key in this transaction
|
||||
txn->Put("abc", "def"); |
||||
|
||||
// Read a key OUTSIDE this transaction. Does not affect txn.
|
||||
s = db->Get(read_options, "abc", &value); |
||||
|
||||
// Write a key OUTSIDE of this transaction.
|
||||
// Does not affect txn since this is an unrelated key. If we wrote key 'abc'
|
||||
// here, the transaction would fail to commit.
|
||||
s = db->Put(write_options, "xyz", "zzz"); |
||||
|
||||
// Commit transaction
|
||||
s = txn->Commit(); |
||||
assert(s.ok()); |
||||
delete txn; |
||||
|
||||
////////////////////////////////////////////////////////
|
||||
//
|
||||
// "Repeatable Read" (Snapshot Isolation) Example
|
||||
// -- Using a single Snapshot
|
||||
//
|
||||
////////////////////////////////////////////////////////
|
||||
|
||||
// Set a snapshot at start of transaction by setting set_snapshot=true
|
||||
txn_options.set_snapshot = true; |
||||
txn = txn_db->BeginTransaction(write_options, txn_options); |
||||
|
||||
const Snapshot* snapshot = txn->GetSnapshot(); |
||||
|
||||
// Write a key OUTSIDE of transaction
|
||||
db->Put(write_options, "abc", "xyz"); |
||||
|
||||
// Read a key using the snapshot
|
||||
read_options.snapshot = snapshot; |
||||
s = txn->GetForUpdate(read_options, "abc", &value); |
||||
assert(value == "def"); |
||||
|
||||
// Attempt to commit transaction
|
||||
s = txn->Commit(); |
||||
|
||||
// Transaction could not commit since the write outside of the txn conflicted
|
||||
// with the read!
|
||||
assert(s.IsBusy()); |
||||
|
||||
delete txn; |
||||
// Clear snapshot from read options since it is no longer valid
|
||||
read_options.snapshot = nullptr; |
||||
snapshot = nullptr; |
||||
|
||||
////////////////////////////////////////////////////////
|
||||
//
|
||||
// "Read Committed" (Monotonic Atomic Views) Example
|
||||
// --Using multiple Snapshots
|
||||
//
|
||||
////////////////////////////////////////////////////////
|
||||
|
||||
// In this example, we set the snapshot multiple times. This is probably
|
||||
// only necessary if you have very strict isolation requirements to
|
||||
// implement.
|
||||
|
||||
// Set a snapshot at start of transaction
|
||||
txn_options.set_snapshot = true; |
||||
txn = txn_db->BeginTransaction(write_options, txn_options); |
||||
|
||||
// Do some reads and writes to key "x"
|
||||
read_options.snapshot = db->GetSnapshot(); |
||||
s = txn->Get(read_options, "x", &value); |
||||
txn->Put("x", "x"); |
||||
|
||||
// Do a write outside of the transaction to key "y"
|
||||
s = db->Put(write_options, "y", "y"); |
||||
|
||||
// Set a new snapshot in the transaction
|
||||
txn->SetSnapshot(); |
||||
read_options.snapshot = db->GetSnapshot(); |
||||
|
||||
// Do some reads and writes to key "y"
|
||||
s = txn->GetForUpdate(read_options, "y", &value); |
||||
txn->Put("y", "y"); |
||||
|
||||
// Commit. Since the snapshot was advanced, the write done outside of the
|
||||
// transaction does not prevent this transaction from Committing.
|
||||
s = txn->Commit(); |
||||
assert(s.ok()); |
||||
delete txn; |
||||
// Clear snapshot from read options since it is no longer valid
|
||||
read_options.snapshot = nullptr; |
||||
|
||||
// Cleanup
|
||||
delete txn_db; |
||||
DestroyDB(kDBPath, options); |
||||
return 0; |
||||
} |
||||
|
||||
#endif // ROCKSDB_LITE
|
@ -1,233 +0,0 @@ |
||||
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once |
||||
|
||||
#ifndef ROCKSDB_LITE |
||||
|
||||
#include <string> |
||||
#include <vector> |
||||
|
||||
#include "rocksdb/comparator.h" |
||||
#include "rocksdb/db.h" |
||||
#include "rocksdb/status.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
class OptimisticTransactionDB; |
||||
class WriteBatchWithIndex; |
||||
|
||||
// Provides BEGIN/COMMIT/ROLLBACK transactions for batched writes.
|
||||
//
|
||||
// The current implementation provides optimistic concurrency control.
|
||||
// Transactional reads/writes will not block other operations in the
|
||||
// db. At commit time, the batch of writes will only be written if there have
|
||||
// been no other writes to any keys read or written by this transaction.
|
||||
// Otherwise, the commit will return an error.
|
||||
//
|
||||
// A new optimistic transaction is created by calling
|
||||
// OptimisticTransactionDB::BeginTransaction().
|
||||
// Only reads/writes done through this transaction object will be a part of the
|
||||
// transaction. Any other reads/writes will not be tracked by this
|
||||
// transaction.
|
||||
//
|
||||
// For example, reading data via OptimisticTransaction::GetForUpdate() will
|
||||
// prevent the transaction from committing if this key is written to outside of
|
||||
// this transaction. Any reads done via DB::Get() will not be checked for
|
||||
// conflicts at commit time.
|
||||
//
|
||||
// It is up to the caller to synchronize access to this object.
|
||||
//
|
||||
// See examples/transaction_example.cc for some simple examples.
|
||||
//
|
||||
// TODO(agiardullo): Not yet implemented:
|
||||
// -Transaction support for iterators
|
||||
// -Ensuring memtable holds large enough history to check for conflicts
|
||||
// -Support for using Transactions with DBWithTTL
|
||||
|
||||
// Options to use when starting an Optimistic Transaction
|
||||
struct OptimisticTransactionOptions { |
||||
// Setting set_snapshot=true is the same as calling SetSnapshot().
|
||||
bool set_snapshot = false; |
||||
|
||||
// Should be set if the DB has a non-default comparator.
|
||||
// See comment in WriteBatchWithIndex constructor.
|
||||
const Comparator* cmp = BytewiseComparator(); |
||||
}; |
||||
|
||||
class OptimisticTransaction { |
||||
public: |
||||
virtual ~OptimisticTransaction() {} |
||||
|
||||
// If SetSnapshot() is not called, all keys read/written through this
|
||||
// transaction will only be committed if there have been no writes to
|
||||
// these keys outside of this transaction *since the time each key
|
||||
// was first read/written* in this transaction.
|
||||
//
|
||||
// When SetSnapshot() is called, this transaction will create a Snapshot
|
||||
// to use for conflict validation of all future operations in the transaction.
|
||||
// All future keys read/written will only be committed if there have been
|
||||
// no writes to these keys outside of this transaction *since SetSnapshot()
|
||||
// was called.* Otherwise, Commit() will not succeed.
|
||||
//
|
||||
// It is not necessary to call SetSnapshot() if you only care about other
|
||||
// writes happening on keys *after* they have first been read/written in this
|
||||
// transaction. However, you should set a snapshot if you are concerned
|
||||
// with any other writes happening since a particular time (such as
|
||||
// the start of the transaction).
|
||||
//
|
||||
// SetSnapshot() may be called multiple times if you would like to change
|
||||
// the snapshot used for different operations in this transaction.
|
||||
//
|
||||
// Calling SetSnapshot will not affect the version of Data returned by Get()
|
||||
// methods. See OptimisticTransaction::Get() for more details.
|
||||
//
|
||||
// TODO(agiardullo): add better documentation here once memtable change are
|
||||
// committed
|
||||
virtual void SetSnapshot() = 0; |
||||
|
||||
// Returns the Snapshot created by the last call to SetSnapshot().
|
||||
//
|
||||
// REQUIRED: The returned Snapshot is only valid up until the next time
|
||||
// SetSnapshot() is called or the OptimisticTransaction is deleted.
|
||||
virtual const Snapshot* GetSnapshot() const = 0; |
||||
|
||||
// Write all batched keys to the db atomically if there have not been any
|
||||
// other writes performed on the keys read/written by this transaction.
|
||||
//
|
||||
// Currently, Commit() only checks the memtables to verify that there are no
|
||||
// other writes to these keys. If the memtable's history is not long
|
||||
// enough to verify that there are no conflicts, Commit() will return
|
||||
// a non-OK status.
|
||||
//
|
||||
// Returns OK on success, non-OK on failure.
|
||||
virtual Status Commit() = 0; |
||||
|
||||
// Discard all batched writes in this transaction.
|
||||
virtual void Rollback() = 0; |
||||
|
||||
// This function is similar to DB::Get() except it will also read pending
|
||||
// changes in this transaction.
|
||||
//
|
||||
// If read_options.snapshot is not set, the current version of the key will
|
||||
// be read. Calling SetSnapshot() does not affect the version of the data
|
||||
// returned.
|
||||
//
|
||||
// Note that setting read_options.snapshot will affect what is read from the
|
||||
// DB but will NOT change which keys are read from this transaction (the keys
|
||||
// in this transaction do not yet belong to any snapshot and will be fetched
|
||||
// regardless).
|
||||
//
|
||||
virtual Status Get(const ReadOptions& options, |
||||
ColumnFamilyHandle* column_family, const Slice& key, |
||||
std::string* value) = 0; |
||||
|
||||
virtual Status Get(const ReadOptions& options, const Slice& key, |
||||
std::string* value) = 0; |
||||
|
||||
virtual std::vector<Status> MultiGet( |
||||
const ReadOptions& options, |
||||
const std::vector<ColumnFamilyHandle*>& column_family, |
||||
const std::vector<Slice>& keys, std::vector<std::string>* values) = 0; |
||||
|
||||
virtual std::vector<Status> MultiGet(const ReadOptions& options, |
||||
const std::vector<Slice>& keys, |
||||
std::vector<std::string>* values) = 0; |
||||
|
||||
// Read this key and ensure that this transaction will only
|
||||
// be able to be committed if this key is not written outside this
|
||||
// transaction after it has first been read (or after the snapshot if a
|
||||
// snapshot is set in this transaction).
|
||||
|
||||
// This function is similar to OptimisticTransaction::Get() except it will
|
||||
// affect whether this transaction will be able to be committed.
|
||||
virtual Status GetForUpdate(const ReadOptions& options, |
||||
ColumnFamilyHandle* column_family, |
||||
const Slice& key, std::string* value) = 0; |
||||
|
||||
virtual Status GetForUpdate(const ReadOptions& options, const Slice& key, |
||||
std::string* value) = 0; |
||||
|
||||
virtual std::vector<Status> MultiGetForUpdate( |
||||
const ReadOptions& options, |
||||
const std::vector<ColumnFamilyHandle*>& column_family, |
||||
const std::vector<Slice>& keys, std::vector<std::string>* values) = 0; |
||||
|
||||
virtual std::vector<Status> MultiGetForUpdate( |
||||
const ReadOptions& options, const std::vector<Slice>& keys, |
||||
std::vector<std::string>* values) = 0; |
||||
|
||||
// Put, Merge, and Delete behave similarly to their corresponding
|
||||
// functions in WriteBatch. In addition, this transaction will only
|
||||
// be able to be committed if these keys are not written outside of this
|
||||
// transaction after they have been written by this transaction (or after the
|
||||
// snapshot if a snapshot is set in this transaction).
|
||||
virtual void Put(ColumnFamilyHandle* column_family, const Slice& key, |
||||
const Slice& value) = 0; |
||||
virtual void Put(const Slice& key, const Slice& value) = 0; |
||||
virtual void Put(ColumnFamilyHandle* column_family, const SliceParts& key, |
||||
const SliceParts& value) = 0; |
||||
virtual void Put(const SliceParts& key, const SliceParts& value) = 0; |
||||
|
||||
virtual void Merge(ColumnFamilyHandle* column_family, const Slice& key, |
||||
const Slice& value) = 0; |
||||
virtual void Merge(const Slice& key, const Slice& value) = 0; |
||||
|
||||
virtual void Delete(ColumnFamilyHandle* column_family, const Slice& key) = 0; |
||||
virtual void Delete(const Slice& key) = 0; |
||||
virtual void Delete(ColumnFamilyHandle* column_family, |
||||
const SliceParts& key) = 0; |
||||
virtual void Delete(const SliceParts& key) = 0; |
||||
|
||||
// PutUntracked() will write a Put to the batch of operations to be committed
|
||||
// in this transaction. This write will only happen if this transaction
|
||||
// gets committed successfully. But unlike OptimisticTransaction::Put(),
|
||||
// no conflict checking will be done for this key. So any other writes to
|
||||
// this key outside of this transaction will not prevent this transaction from
|
||||
// committing.
|
||||
virtual void PutUntracked(ColumnFamilyHandle* column_family, const Slice& key, |
||||
const Slice& value) = 0; |
||||
virtual void PutUntracked(const Slice& key, const Slice& value) = 0; |
||||
virtual void PutUntracked(ColumnFamilyHandle* column_family, |
||||
const SliceParts& key, const SliceParts& value) = 0; |
||||
virtual void PutUntracked(const SliceParts& key, const SliceParts& value) = 0; |
||||
|
||||
virtual void MergeUntracked(ColumnFamilyHandle* column_family, |
||||
const Slice& key, const Slice& value) = 0; |
||||
virtual void MergeUntracked(const Slice& key, const Slice& value) = 0; |
||||
|
||||
virtual void DeleteUntracked(ColumnFamilyHandle* column_family, |
||||
const Slice& key) = 0; |
||||
|
||||
virtual void DeleteUntracked(const Slice& key) = 0; |
||||
virtual void DeleteUntracked(ColumnFamilyHandle* column_family, |
||||
const SliceParts& key) = 0; |
||||
virtual void DeleteUntracked(const SliceParts& key) = 0; |
||||
|
||||
// Similar to WriteBatch::PutLogData
|
||||
virtual void PutLogData(const Slice& blob) = 0; |
||||
|
||||
// Fetch the underlying write batch that contains all pending changes to be
|
||||
// committed.
|
||||
//
|
||||
// Note: You should not write or delete anything from the batch directly and
|
||||
// should only use the the functions in the OptimisticTransaction class to
|
||||
// write to this transaction.
|
||||
virtual WriteBatchWithIndex* GetWriteBatch() = 0; |
||||
|
||||
protected: |
||||
// To begin a new transaction, see OptimisticTransactionDB::BeginTransaction()
|
||||
explicit OptimisticTransaction(const OptimisticTransactionDB* db) {} |
||||
OptimisticTransaction() {} |
||||
|
||||
private: |
||||
// No copying allowed
|
||||
OptimisticTransaction(const OptimisticTransaction&); |
||||
void operator=(const OptimisticTransaction&); |
||||
}; |
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // ROCKSDB_LITE
|
@ -0,0 +1,260 @@ |
||||
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once |
||||
|
||||
#ifndef ROCKSDB_LITE |
||||
|
||||
#include <string> |
||||
#include <vector> |
||||
|
||||
#include "rocksdb/comparator.h" |
||||
#include "rocksdb/db.h" |
||||
#include "rocksdb/status.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
class Iterator; |
||||
class TransactionDB; |
||||
class WriteBatchWithIndex; |
||||
|
||||
// Provides BEGIN/COMMIT/ROLLBACK transactions.
|
||||
//
|
||||
// To use transactions, you must first create either an OptimisticTransactionDB
|
||||
// or a TransactionDB. See examples/[optimistic_]transaction_example.cc for
|
||||
// more information.
|
||||
//
|
||||
// To create a transaction, use [Optimistic]TransactionDB::BeginTransaction().
|
||||
//
|
||||
// It is up to the caller to synchronize access to this object.
|
||||
//
|
||||
// See examples/transaction_example.cc for some simple examples.
|
||||
//
|
||||
// TODO(agiardullo): Not yet implemented
|
||||
// -PerfContext statistics
|
||||
// -Support for using Transactions with DBWithTTL
|
||||
class Transaction { |
||||
public: |
||||
virtual ~Transaction() {} |
||||
|
||||
// If a transaction has a snapshot set, the transaction will ensure that
|
||||
// any keys successfully written(or fetched via GetForUpdate()) have not
|
||||
// been modified outside of this transaction since the time the snapshot was
|
||||
// set.
|
||||
// If a snapshot has not been set, the transaction guarantees that keys have
|
||||
// not been modified since the time each key was first written (or fetched via
|
||||
// GetForUpdate()).
|
||||
//
|
||||
// Using SetSnapshot() will provide stricter isolation guarantees at the
|
||||
// expense of potentially more transaction failures due to conflicts with
|
||||
// other writes.
|
||||
//
|
||||
// Calling SetSnapshot() has no effect on keys written before this function
|
||||
// has been called.
|
||||
//
|
||||
// SetSnapshot() may be called multiple times if you would like to change
|
||||
// the snapshot used for different operations in this transaction.
|
||||
//
|
||||
// Calling SetSnapshot will not affect the version of Data returned by Get()
|
||||
// methods. See Transaction::Get() for more details.
|
||||
virtual void SetSnapshot() = 0; |
||||
|
||||
// Returns the Snapshot created by the last call to SetSnapshot().
|
||||
//
|
||||
// REQUIRED: The returned Snapshot is only valid up until the next time
|
||||
// SetSnapshot() is called or the Transaction is deleted.
|
||||
virtual const Snapshot* GetSnapshot() const = 0; |
||||
|
||||
// Write all batched keys to the db atomically.
|
||||
//
|
||||
// Returns OK on success.
|
||||
//
|
||||
// May return any error status that could be returned by DB:Write().
|
||||
//
|
||||
// If this transaction was created by an OptimisticTransactionDB(),
|
||||
// Status::Busy() may be returned if the transaction could not guarantee
|
||||
// that there are no write conflicts.
|
||||
//
|
||||
// If this transaction was created by a TransactionDB(), Status::TimedOut()
|
||||
// may be returned if this transaction has lived for longer than
|
||||
// TransactionOptions.expiration.
|
||||
virtual Status Commit() = 0; |
||||
|
||||
// Discard all batched writes in this transaction.
|
||||
virtual void Rollback() = 0; |
||||
|
||||
// Records the state of the transaction for future calls to
|
||||
// RollbackToSavePoint(). May be called multiple times to set multiple save
|
||||
// points.
|
||||
virtual void SetSavePoint() = 0; |
||||
|
||||
// Undo all operations in this transaction (Put, Merge, Delete, PutLogData)
|
||||
// since the
|
||||
// most recent call to SetSavePoint() and removes the most recent
|
||||
// SetSavePoint().
|
||||
// If there is no previous call to SetSavePoint(), behaves the same as
|
||||
// Rollback()
|
||||
virtual void RollbackToSavePoint() = 0; |
||||
|
||||
// This function is similar to DB::Get() except it will also read pending
|
||||
// changes in this transaction.
|
||||
//
|
||||
// If read_options.snapshot is not set, the current version of the key will
|
||||
// be read. Calling SetSnapshot() does not affect the version of the data
|
||||
// returned.
|
||||
//
|
||||
// Note that setting read_options.snapshot will affect what is read from the
|
||||
// DB but will NOT change which keys are read from this transaction (the keys
|
||||
// in this transaction do not yet belong to any snapshot and will be fetched
|
||||
// regardless).
|
||||
virtual Status Get(const ReadOptions& options, |
||||
ColumnFamilyHandle* column_family, const Slice& key, |
||||
std::string* value) = 0; |
||||
|
||||
virtual Status Get(const ReadOptions& options, const Slice& key, |
||||
std::string* value) = 0; |
||||
|
||||
virtual std::vector<Status> MultiGet( |
||||
const ReadOptions& options, |
||||
const std::vector<ColumnFamilyHandle*>& column_family, |
||||
const std::vector<Slice>& keys, std::vector<std::string>* values) = 0; |
||||
|
||||
virtual std::vector<Status> MultiGet(const ReadOptions& options, |
||||
const std::vector<Slice>& keys, |
||||
std::vector<std::string>* values) = 0; |
||||
|
||||
// Read this key and ensure that this transaction will only
|
||||
// be able to be committed if this key is not written outside this
|
||||
// transaction after it has first been read (or after the snapshot if a
|
||||
// snapshot is set in this transaction). The transaction behavior is the
|
||||
// same regardless of whether the key exists or not.
|
||||
//
|
||||
// The values returned by this function are similar to Transaction::Get().
|
||||
// If value==nullptr, then this function will not read any data, but will
|
||||
// still ensure that this key cannot be written to by outside of this
|
||||
// transaction.
|
||||
//
|
||||
// If this transaction was created by a TransactionDB, Status::Busy() may be
|
||||
// returned.
|
||||
// If this transaction was created by an OptimisticTransaction, GetForUpdate()
|
||||
// could cause commit() to later return Status::Busy().
|
||||
virtual Status GetForUpdate(const ReadOptions& options, |
||||
ColumnFamilyHandle* column_family, |
||||
const Slice& key, std::string* value) = 0; |
||||
|
||||
virtual Status GetForUpdate(const ReadOptions& options, const Slice& key, |
||||
std::string* value) = 0; |
||||
|
||||
virtual std::vector<Status> MultiGetForUpdate( |
||||
const ReadOptions& options, |
||||
const std::vector<ColumnFamilyHandle*>& column_family, |
||||
const std::vector<Slice>& keys, std::vector<std::string>* values) = 0; |
||||
|
||||
virtual std::vector<Status> MultiGetForUpdate( |
||||
const ReadOptions& options, const std::vector<Slice>& keys, |
||||
std::vector<std::string>* values) = 0; |
||||
|
||||
// Returns an iterator that will iterate on all keys in the default
|
||||
// column family including both keys in the DB and uncommitted keys in this
|
||||
// transaction.
|
||||
//
|
||||
// Setting read_options.snapshot will affect what is read from the
|
||||
// DB but will NOT change which keys are read from this transaction (the keys
|
||||
// in this transaction do not yet belong to any snapshot and will be fetched
|
||||
// regardless).
|
||||
//
|
||||
// Caller is reponsible for deleting the returned Iterator.
|
||||
//
|
||||
// The returned iterator is only valid until Commit(), Rollback(), or
|
||||
// RollbackToSavePoint() is called.
|
||||
// NOTE: Transaction::Put/Merge/Delete will currently invalidate this iterator
|
||||
// until
|
||||
// the following issue is fixed:
|
||||
// https://github.com/facebook/rocksdb/issues/616
|
||||
virtual Iterator* GetIterator(const ReadOptions& read_options) = 0; |
||||
|
||||
virtual Iterator* GetIterator(const ReadOptions& read_options, |
||||
ColumnFamilyHandle* column_family) = 0; |
||||
|
||||
// Put, Merge, and Delete behave similarly to their corresponding
|
||||
// functions in WriteBatch, but will also do conflict checking on the
|
||||
// keys being written.
|
||||
//
|
||||
// If this Transaction was created on an OptimisticTransactionDB, these
|
||||
// functions should always return Status::OK().
|
||||
// If this Transaction was created on a TransactionDB, the functions can
|
||||
// return Status::Busy() if they could not acquire a lock.
|
||||
virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key, |
||||
const Slice& value) = 0; |
||||
virtual Status Put(const Slice& key, const Slice& value) = 0; |
||||
virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, |
||||
const SliceParts& value) = 0; |
||||
virtual Status Put(const SliceParts& key, const SliceParts& value) = 0; |
||||
|
||||
virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key, |
||||
const Slice& value) = 0; |
||||
virtual Status Merge(const Slice& key, const Slice& value) = 0; |
||||
|
||||
virtual Status Delete(ColumnFamilyHandle* column_family, |
||||
const Slice& key) = 0; |
||||
virtual Status Delete(const Slice& key) = 0; |
||||
virtual Status Delete(ColumnFamilyHandle* column_family, |
||||
const SliceParts& key) = 0; |
||||
virtual Status Delete(const SliceParts& key) = 0; |
||||
|
||||
// PutUntracked() will write a Put to the batch of operations to be committed
|
||||
// in this transaction. This write will only happen if this transaction
|
||||
// gets committed successfully. But unlike Transaction::Put(),
|
||||
// no conflict checking will be done for this key.
|
||||
//
|
||||
// If this Transaction was created on a TransactionDB, this function will
|
||||
// still acquire locks necessary to make sure this write doesn't cause
|
||||
// conflicts in
|
||||
// other transactions and may return Status::Busy().
|
||||
virtual Status PutUntracked(ColumnFamilyHandle* column_family, |
||||
const Slice& key, const Slice& value) = 0; |
||||
virtual Status PutUntracked(const Slice& key, const Slice& value) = 0; |
||||
virtual Status PutUntracked(ColumnFamilyHandle* column_family, |
||||
const SliceParts& key, |
||||
const SliceParts& value) = 0; |
||||
virtual Status PutUntracked(const SliceParts& key, |
||||
const SliceParts& value) = 0; |
||||
|
||||
virtual Status MergeUntracked(ColumnFamilyHandle* column_family, |
||||
const Slice& key, const Slice& value) = 0; |
||||
virtual Status MergeUntracked(const Slice& key, const Slice& value) = 0; |
||||
|
||||
virtual Status DeleteUntracked(ColumnFamilyHandle* column_family, |
||||
const Slice& key) = 0; |
||||
|
||||
virtual Status DeleteUntracked(const Slice& key) = 0; |
||||
virtual Status DeleteUntracked(ColumnFamilyHandle* column_family, |
||||
const SliceParts& key) = 0; |
||||
virtual Status DeleteUntracked(const SliceParts& key) = 0; |
||||
|
||||
// Similar to WriteBatch::PutLogData
|
||||
virtual void PutLogData(const Slice& blob) = 0; |
||||
|
||||
// Fetch the underlying write batch that contains all pending changes to be
|
||||
// committed.
|
||||
//
|
||||
// Note: You should not write or delete anything from the batch directly and
|
||||
// should only use the the functions in the Transaction class to
|
||||
// write to this transaction.
|
||||
virtual WriteBatchWithIndex* GetWriteBatch() = 0; |
||||
|
||||
protected: |
||||
explicit Transaction(const TransactionDB* db) {} |
||||
Transaction() {} |
||||
|
||||
private: |
||||
// No copying allowed
|
||||
Transaction(const Transaction&); |
||||
void operator=(const Transaction&); |
||||
}; |
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // ROCKSDB_LITE
|
@ -0,0 +1,130 @@ |
||||
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once |
||||
#ifndef ROCKSDB_LITE |
||||
|
||||
#include <string> |
||||
#include <vector> |
||||
|
||||
#include "rocksdb/comparator.h" |
||||
#include "rocksdb/db.h" |
||||
#include "rocksdb/utilities/stackable_db.h" |
||||
#include "rocksdb/utilities/transaction.h" |
||||
|
||||
// Database with Transaction support.
|
||||
//
|
||||
// See transaction.h and examples/transaction_example.cc
|
||||
|
||||
namespace rocksdb { |
||||
|
||||
// Database-wide settings supplied when opening a TransactionDB.
struct TransactionDBOptions {
  // Upper bound on how many keys may be locked simultaneously, counted per
  // column family. Once more keys than this are locked, transaction writes
  // (or GetForUpdate) return an error.
  // A non-positive value disables the limit.
  int64_t max_num_locks{-1};

  // Number of sub-tables the per-column-family lock table is split into.
  // Each sub-table has its own mutex, so a larger value allows more
  // concurrency.
  size_t num_stripes{16};

  // Default time, in milliseconds, a transaction waits when trying to lock a
  // key; applies when TransactionOptions::lock_timeout does not specify one.
  //
  //   > 0: wait up to this many milliseconds.
  //   = 0: do not wait at all if the lock cannot be acquired instantly.
  //   < 0: no timeout. This is not recommended because it can lead to
  //        deadlocks, and there is currently no deadlock detection to
  //        recover from one.
  int64_t transaction_lock_timeout{1000};  // 1 second

  // Time, in milliseconds, to wait for a lock when a key is written OUTSIDE
  // of a transaction (i.e. by calling DB::Put(), Merge(), Delete() or Write()
  // directly).
  //
  //   > 0: wait up to this many milliseconds.
  //   = 0: do not wait at all if the lock cannot be acquired instantly.
  //   < 0: no timeout; block indefinitely until the lock is acquired.
  //
  // Not using a timeout can lead to deadlocks, and there is currently no
  // deadlock detection to recover from one. While DB writes cannot deadlock
  // with other DB writes, they can deadlock with a transaction. A negative
  // timeout should only be used if all transactions have a small expiration
  // set.
  int64_t default_lock_timeout{1000};  // 1 second
};
||||
|
||||
// Settings that apply to a single transaction.
struct TransactionOptions {
  // When true, the transaction starts as though Transaction::SetSnapshot()
  // had been called on it.
  bool set_snapshot{false};

  // TODO(agiardullo): TransactionDB does not yet support comparators that allow
  // two non-equal keys to be equivalent.  Ie, cmp->Compare(a,b) should only
  // return 0 if a.compare(b) returns 0.

  // Time, in milliseconds, this transaction waits when trying to lock a key.
  //
  //   > 0: wait up to this many milliseconds.
  //   = 0: do not wait at all if the lock cannot be acquired instantly.
  //   < 0: fall back to TransactionDBOptions::transaction_lock_timeout.
  int64_t lock_timeout{-1};

  // Expiration duration in milliseconds.  If non-negative, a transaction that
  // lasts longer than this many milliseconds will fail to commit.  If not
  // set, a forgotten transaction that is never committed, rolled back, or
  // deleted will never relinquish any locks it holds, which could prevent
  // keys from being written by other writers.
  //
  // TODO(agiardullo): Improve performance of checking expiration time.
  int64_t expiration{-1};
};
||||
|
||||
class TransactionDB : public StackableDB { |
||||
public: |
||||
// Open a TransactionDB similar to DB::Open().
|
||||
static Status Open(const Options& options, |
||||
const TransactionDBOptions& txn_db_options, |
||||
const std::string& dbname, TransactionDB** dbptr); |
||||
|
||||
static Status Open(const DBOptions& db_options, |
||||
const TransactionDBOptions& txn_db_options, |
||||
const std::string& dbname, |
||||
const std::vector<ColumnFamilyDescriptor>& column_families, |
||||
std::vector<ColumnFamilyHandle*>* handles, |
||||
TransactionDB** dbptr); |
||||
|
||||
virtual ~TransactionDB() {} |
||||
|
||||
// Starts a new Transaction. Passing set_snapshot=true has the same effect
|
||||
// as calling Transaction::SetSnapshot().
|
||||
//
|
||||
// Caller should delete the returned transaction after calling
|
||||
// Transaction::Commit() or Transaction::Rollback().
|
||||
virtual Transaction* BeginTransaction( |
||||
const WriteOptions& write_options, |
||||
const TransactionOptions& txn_options = TransactionOptions()) = 0; |
||||
|
||||
protected: |
||||
// To Create an TransactionDB, call Open()
|
||||
explicit TransactionDB(DB* db) : StackableDB(db) {} |
||||
|
||||
private: |
||||
// No copying allowed
|
||||
TransactionDB(const TransactionDB&); |
||||
void operator=(const TransactionDB&); |
||||
}; |
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // ROCKSDB_LITE
|
@ -0,0 +1,254 @@ |
||||
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#ifndef ROCKSDB_LITE |
||||
|
||||
#include <string> |
||||
#include <vector> |
||||
|
||||
#include "utilities/transactions/transaction_db_impl.h" |
||||
|
||||
#include "db/db_impl.h" |
||||
#include "rocksdb/db.h" |
||||
#include "rocksdb/options.h" |
||||
#include "rocksdb/utilities/transaction_db.h" |
||||
#include "utilities/transactions/transaction_impl.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
// Wraps `db`, keeps a copy of the options, and sizes the lock manager from
// the stripe count and lock limit found in those options.
TransactionDBImpl::TransactionDBImpl(DB* db,
                                     const TransactionDBOptions& txn_db_options)
    : TransactionDB(db),
      txn_db_options_(txn_db_options),
      lock_mgr_(txn_db_options.num_stripes, txn_db_options.max_num_locks) {}
||||
|
||||
// Allocates a new TransactionImpl bound to this DB.  The returned pointer is
// heap-allocated with `new`.
Transaction* TransactionDBImpl::BeginTransaction(
    const WriteOptions& write_options, const TransactionOptions& txn_options) {
  return new TransactionImpl(this, write_options, txn_options);
}
||||
|
||||
// Returns a copy of `txn_db_options` in which a stripe count of zero has been
// raised to one.  All other fields are copied unchanged.
TransactionDBOptions TransactionDBImpl::ValidateTxnDBOptions(
    const TransactionDBOptions& txn_db_options) {
  TransactionDBOptions sanitized(txn_db_options);

  if (sanitized.num_stripes == 0) {
    sanitized.num_stripes = 1;
  }

  return sanitized;
}
||||
|
||||
// Opens a TransactionDB that only uses the default column family by
// delegating to the column-family aware Open() overload.
Status TransactionDB::Open(const Options& options,
                           const TransactionDBOptions& txn_db_options,
                           const std::string& dbname, TransactionDB** dbptr) {
  // Split the combined Options into their DB-wide and per-column-family parts.
  DBOptions db_options(options);
  ColumnFamilyOptions cf_options(options);

  ColumnFamilyDescriptor default_cf(kDefaultColumnFamilyName, cf_options);
  std::vector<ColumnFamilyDescriptor> column_families;
  column_families.push_back(default_cf);

  std::vector<ColumnFamilyHandle*> handles;
  Status s = TransactionDB::Open(db_options, txn_db_options, dbname,
                                 column_families, &handles, dbptr);
  if (!s.ok()) {
    return s;
  }

  assert(handles.size() == 1);
  // The handle can be deleted since DBImpl is always holding a reference to
  // the default column family.
  delete handles[0];

  return s;
}
||||
|
||||
// Opens the underlying DB with the given column families and wraps it in a
// TransactionDBImpl.
//
// The column family descriptors are copied so that memtable history can be
// enabled on the copy without touching the caller's vector.  On success,
// every returned handle is registered with the transaction lock manager and
// `*dbptr` is set; on failure `*dbptr` is left untouched.
Status TransactionDB::Open(
    const DBOptions& db_options, const TransactionDBOptions& txn_db_options,
    const std::string& dbname,
    const std::vector<ColumnFamilyDescriptor>& column_families,
    std::vector<ColumnFamilyHandle*>* handles, TransactionDB** dbptr) {
  Status s;
  DB* db;

  std::vector<ColumnFamilyDescriptor> column_families_copy = column_families;

  // Enable MemTable History if not already enabled
  for (auto& column_family : column_families_copy) {
    ColumnFamilyOptions* options = &column_family.options;

    if (options->max_write_buffer_number_to_maintain == 0) {
      // Setting to -1 will set the History size to max_write_buffer_number.
      options->max_write_buffer_number_to_maintain = -1;
    }
  }

  // Open with the adjusted copy; passing the caller's original descriptors
  // here would silently discard the memtable-history setting applied above.
  s = DB::Open(db_options, dbname, column_families_copy, handles, &db);

  if (s.ok()) {
    TransactionDBImpl* txn_db = new TransactionDBImpl(
        db, TransactionDBImpl::ValidateTxnDBOptions(txn_db_options));

    // Let the lock manager know about every column family that was opened.
    for (auto cf_ptr : *handles) {
      txn_db->AddColumnFamily(cf_ptr);
    }

    *dbptr = txn_db;
  }

  return s;
}
||||
|
||||
// Let TransactionLockMgr know that this column family exists so it can
|
||||
// allocate a LockMap for it.
|
||||
void TransactionDBImpl::AddColumnFamily(const ColumnFamilyHandle* handle) { |
||||
lock_mgr_.AddColumnFamily(handle->GetID()); |
||||
} |
||||
|
||||
// Creates the column family on the wrapped DB and, if that succeeds,
// registers its ID with the lock manager.  The whole operation runs while
// holding column_family_mutex_.
Status TransactionDBImpl::CreateColumnFamily(
    const ColumnFamilyOptions& options, const std::string& column_family_name,
    ColumnFamilyHandle** handle) {
  InstrumentedMutexLock l(&column_family_mutex_);

  Status s = db_->CreateColumnFamily(options, column_family_name, handle);
  if (!s.ok()) {
    return s;
  }

  lock_mgr_.AddColumnFamily((*handle)->GetID());
  return s;
}
||||
|
||||
// Let TransactionLockMgr know that it can deallocate the LockMap for this
|
||||
// column family.
|
||||
Status TransactionDBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { |
||||
InstrumentedMutexLock l(&column_family_mutex_); |
||||
|
||||
Status s = db_->DropColumnFamily(column_family); |
||||
if (s.ok()) { |
||||
lock_mgr_.RemoveColumnFamily(column_family->GetID()); |
||||
} |
||||
|
||||
return s; |
||||
} |
||||
|
||||
// Asks the lock manager to lock `key` in column family `cfh_id` on behalf of
// `txn`, and returns the lock manager's status.
Status TransactionDBImpl::TryLock(TransactionImpl* txn, uint32_t cfh_id,
                                  const std::string& key) {
  auto* env = GetEnv();
  return lock_mgr_.TryLock(txn, cfh_id, key, env);
}
||||
|
||||
// Hands the whole key map `keys` to the lock manager to be unlocked on
// behalf of `txn`.
void TransactionDBImpl::UnLock(TransactionImpl* txn, TransactionKeyMap* keys) {
  auto* env = GetEnv();
  lock_mgr_.UnLock(txn, keys, env);
}
||||
|
||||
// Asks the lock manager to unlock the single `key` in column family `cfh_id`
// on behalf of `txn`.
void TransactionDBImpl::UnLock(TransactionImpl* txn, uint32_t cfh_id,
                               const std::string& key) {
  auto* env = GetEnv();
  lock_mgr_.UnLock(txn, cfh_id, key, env);
}
||||
|
||||
// Used when wrapping DB write operations in a transaction
|
||||
Transaction* TransactionDBImpl::BeginInternalTransaction( |
||||
const WriteOptions& options) { |
||||
TransactionOptions txn_options; |
||||
Transaction* txn = BeginTransaction(options, txn_options); |
||||
|
||||
assert(dynamic_cast<TransactionImpl*>(txn) != nullptr); |
||||
auto txn_impl = reinterpret_cast<TransactionImpl*>(txn); |
||||
|
||||
// Use default timeout for non-transactional writes
|
||||
txn_impl->SetLockTimeout(txn_db_options_.default_lock_timeout); |
||||
|
||||
return txn; |
||||
} |
||||
|
||||
// All user Put, Merge, Delete, and Write requests must be intercepted to make
|
||||
// sure that they lock all keys that they are writing to avoid causing conflicts
|
||||
// with any concurent transactions. The easiest way to do this is to wrap all
|
||||
// write operations in a transaction.
|
||||
//
|
||||
// Put(), Merge(), and Delete() only lock a single key per call. Write() will
|
||||
// sort its keys before locking them. This guarantees that TransactionDB write
|
||||
// methods cannot deadlock with eachother (but still could deadlock with a
|
||||
// Transaction).
|
||||
Status TransactionDBImpl::Put(const WriteOptions& options, |
||||
ColumnFamilyHandle* column_family, |
||||
const Slice& key, const Slice& val) { |
||||
Status s; |
||||
|
||||
Transaction* txn = BeginInternalTransaction(options); |
||||
|
||||
// Since the client didn't create a transaction, they don't care about
|
||||
// conflict checking for this write. So we just need to do PutUntracked().
|
||||
s = txn->PutUntracked(column_family, key, val); |
||||
|
||||
if (s.ok()) { |
||||
s = txn->Commit(); |
||||
} |
||||
|
||||
delete txn; |
||||
|
||||
return s; |
||||
} |
||||
|
||||
// Deletes `key` through a short-lived internal transaction so that the key
// is locked while the delete is written.
Status TransactionDBImpl::Delete(const WriteOptions& wopts,
                                 ColumnFamilyHandle* column_family,
                                 const Slice& key) {
  Transaction* txn = BeginInternalTransaction(wopts);

  // The client did not create a transaction, so it does not care about
  // conflict checking for this write; DeleteUntracked() is sufficient.
  Status s = txn->DeleteUntracked(column_family, key);
  if (s.ok()) {
    s = txn->Commit();
  }

  delete txn;
  return s;
}
||||
|
||||
// Applies a merge to `key` through a short-lived internal transaction so
// that the key is locked while the merge is written.
Status TransactionDBImpl::Merge(const WriteOptions& options,
                                ColumnFamilyHandle* column_family,
                                const Slice& key, const Slice& value) {
  Transaction* txn = BeginInternalTransaction(options);

  // The client did not create a transaction, so it does not care about
  // conflict checking for this write; MergeUntracked() is sufficient.
  Status s = txn->MergeUntracked(column_family, key, value);
  if (s.ok()) {
    s = txn->Commit();
  }

  delete txn;
  return s;
}
||||
|
||||
// Writes `updates` through a short-lived internal transaction.
//
// Need to lock all keys in this batch to prevent write conflicts with
// concurrent transactions.
Status TransactionDBImpl::Write(const WriteOptions& opts, WriteBatch* updates) {
  Transaction* txn = BeginInternalTransaction(opts);

  // Downcast with static_cast (checked by the assert in debug builds).
  // reinterpret_cast must not be used for base-to-derived conversions since
  // it performs no pointer adjustment.
  assert(dynamic_cast<TransactionImpl*>(txn) != nullptr);
  auto txn_impl = static_cast<TransactionImpl*>(txn);

  // Since CommitBatch sorts the keys before locking, concurrent Write()
  // operations will not cause a deadlock.
  // In order to avoid a deadlock with a concurrent Transaction, Transactions
  // should use a lock timeout.
  Status s = txn_impl->CommitBatch(updates);

  delete txn;

  return s;
}
||||
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
@ -0,0 +1,80 @@ |
||||
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once |
||||
#ifndef ROCKSDB_LITE |
||||
|
||||
#include <string> |
||||
|
||||
#include "rocksdb/db.h" |
||||
#include "rocksdb/options.h" |
||||
#include "rocksdb/utilities/transaction_db.h" |
||||
#include "utilities/transactions/transaction_impl.h" |
||||
#include "utilities/transactions/transaction_lock_mgr.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
class TransactionDBImpl : public TransactionDB { |
||||
public: |
||||
explicit TransactionDBImpl(DB* db, |
||||
const TransactionDBOptions& txn_db_options); |
||||
|
||||
~TransactionDBImpl() {} |
||||
|
||||
Transaction* BeginTransaction(const WriteOptions& write_options, |
||||
const TransactionOptions& txn_options) override; |
||||
|
||||
using StackableDB::Put; |
||||
virtual Status Put(const WriteOptions& options, |
||||
ColumnFamilyHandle* column_family, const Slice& key, |
||||
const Slice& val) override; |
||||
|
||||
using StackableDB::Delete; |
||||
virtual Status Delete(const WriteOptions& wopts, |
||||
ColumnFamilyHandle* column_family, |
||||
const Slice& key) override; |
||||
|
||||
using StackableDB::Merge; |
||||
virtual Status Merge(const WriteOptions& options, |
||||
ColumnFamilyHandle* column_family, const Slice& key, |
||||
const Slice& value) override; |
||||
|
||||
using StackableDB::Write; |
||||
virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; |
||||
|
||||
using StackableDB::CreateColumnFamily; |
||||
virtual Status CreateColumnFamily(const ColumnFamilyOptions& options, |
||||
const std::string& column_family_name, |
||||
ColumnFamilyHandle** handle) override; |
||||
|
||||
using StackableDB::DropColumnFamily; |
||||
virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override; |
||||
|
||||
Status TryLock(TransactionImpl* txn, uint32_t cfh_id, const std::string& key); |
||||
|
||||
void UnLock(TransactionImpl* txn, TransactionKeyMap* keys); |
||||
void UnLock(TransactionImpl* txn, uint32_t cfh_id, const std::string& key); |
||||
|
||||
void AddColumnFamily(const ColumnFamilyHandle* handle); |
||||
|
||||
static TransactionDBOptions ValidateTxnDBOptions( |
||||
const TransactionDBOptions& txn_db_options); |
||||
|
||||
const TransactionDBOptions& GetTxnDBOptions() const { |
||||
return txn_db_options_; |
||||
} |
||||
|
||||
private: |
||||
const TransactionDBOptions txn_db_options_; |
||||
TransactionLockMgr lock_mgr_; |
||||
|
||||
// Must be held when adding/dropping column families.
|
||||
InstrumentedMutex column_family_mutex_; |
||||
Transaction* BeginInternalTransaction(const WriteOptions& options); |
||||
Status WriteHelper(WriteBatch* updates, TransactionImpl* txn_impl); |
||||
}; |
||||
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
@ -0,0 +1,598 @@ |
||||
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#ifndef ROCKSDB_LITE |
||||
|
||||
#include "utilities/transactions/transaction_impl.h" |
||||
|
||||
#include <map> |
||||
#include <set> |
||||
#include <string> |
||||
#include <vector> |
||||
|
||||
#include "db/column_family.h" |
||||
#include "db/db_impl.h" |
||||
#include "rocksdb/comparator.h" |
||||
#include "rocksdb/db.h" |
||||
#include "rocksdb/status.h" |
||||
#include "rocksdb/utilities/transaction_db.h" |
||||
#include "util/string_util.h" |
||||
#include "utilities/transactions/transaction_db_impl.h" |
||||
#include "utilities/transactions/transaction_util.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
struct WriteOptions; |
||||
|
||||
std::atomic<TransactionID> TransactionImpl::txn_id_counter_(1); |
||||
|
||||
TransactionID TransactionImpl::GenTxnID() { |
||||
return txn_id_counter_.fetch_add(1); |
||||
} |
||||
|
||||
// Creates a transaction on `txn_db`, which must be a TransactionDBImpl.
//
// When txn_options.expiration is non-negative, the start time is recorded in
// milliseconds and the expiration time is derived from it; otherwise both are
// zero.  A negative txn_options.lock_timeout is replaced by
// TransactionDBOptions::transaction_lock_timeout.
TransactionImpl::TransactionImpl(TransactionDB* txn_db,
                                 const WriteOptions& write_options,
                                 const TransactionOptions& txn_options)
    : db_(txn_db),
      txn_db_impl_(nullptr),
      txn_id_(GenTxnID()),
      write_options_(write_options),
      snapshot_(nullptr),
      cmp_(GetColumnFamilyUserComparator(txn_db->DefaultColumnFamily())),
      write_batch_(new WriteBatchWithIndex(cmp_, 0, true)),
      start_time_(
          txn_options.expiration >= 0 ? db_->GetEnv()->NowMicros() / 1000 : 0),
      expiration_time_(txn_options.expiration >= 0
                           ? start_time_ + txn_options.expiration
                           : 0),
      lock_timeout_(txn_options.lock_timeout) {
  txn_db_impl_ = dynamic_cast<TransactionDBImpl*>(txn_db);
  assert(txn_db_impl_ != nullptr);

  const bool timeout_unset = lock_timeout_ < 0;
  if (timeout_unset) {
    // No per-transaction lock timeout given; fall back to the DB-wide one.
    lock_timeout_ = txn_db_impl_->GetTxnDBOptions().transaction_lock_timeout;
  }

  if (txn_options.set_snapshot) {
    SetSnapshot();
  }
}
||||
|
||||
// Clears the pending writes, releases all held locks (via Cleanup()), and
// then gives back the snapshot if one was taken.
TransactionImpl::~TransactionImpl() {
  Cleanup();

  if (snapshot_ == nullptr) {
    return;
  }
  db_->ReleaseSnapshot(snapshot_);
}
||||
|
||||
// Takes a fresh snapshot from the DB, releasing the previously held one
// first if there is any.
void TransactionImpl::SetSnapshot() {
  if (snapshot_) {
    // Give back the old snapshot before replacing it.
    db_->ReleaseSnapshot(snapshot_);
  }

  snapshot_ = db_->GetSnapshot();
}
||||
|
||||
// Resets the transaction to an empty state: pending writes are cleared, every
// tracked key is unlocked and forgotten, and all savepoints are dropped.
void TransactionImpl::Cleanup() {
  // Pending writes.
  write_batch_->Clear();
  num_entries_ = 0;

  // Locks: unlock first, then forget the keys.
  txn_db_impl_->UnLock(this, &tracked_keys_);
  tracked_keys_.clear();

  // Savepoints.
  save_points_.reset();
}
||||
|
||||
bool TransactionImpl::IsExpired() const { |
||||
if (expiration_time_ > 0) { |
||||
if (db_->GetEnv()->NowMicros() >= expiration_time_ * 1000) { |
||||
// Transaction is expired.
|
||||
return true; |
||||
} |
||||
} |
||||
|
||||
return false; |
||||
} |
||||
|
||||
// Locks every key referenced by `batch`, writes the batch, and then releases
// exactly the locks taken for it.  If locking fails nothing is written.
Status TransactionImpl::CommitBatch(WriteBatch* batch) {
  TransactionKeyMap locked_keys;

  Status s = LockBatch(batch, &locked_keys);
  if (!s.ok()) {
    return s;
  }

  s = DoCommit(batch);
  txn_db_impl_->UnLock(this, &locked_keys);

  return s;
}
||||
|
||||
// Writes the transaction's accumulated batch and then resets the transaction
// (Cleanup()) regardless of whether the write succeeded.
Status TransactionImpl::Commit() {
  WriteBatch* pending = write_batch_->GetWriteBatch();
  Status s = DoCommit(pending);

  Cleanup();

  return s;
}
||||
|
||||
// Writes `batch` to the base DB using this transaction's write options.
//
// The write goes directly to the base DB since TransactionDB::Write() would
// attempt to do conflict checking that we've already done.
Status TransactionImpl::DoCommit(WriteBatch* batch) {
  Status s;

  DB* db = db_->GetBaseDB();

  if (expiration_time_ > 0) {
    // We cannot commit a transaction that is expired as its locks might have
    // been released.
    // To avoid race conditions, we need to use a WriteCallback to check the
    // expiration time once we're on the writer thread.
    TransactionCallback callback(this);

    // Downcast with static_cast (checked by the assert in debug builds).
    // reinterpret_cast must not be used for base-to-derived conversions since
    // it performs no pointer adjustment.
    assert(dynamic_cast<DBImpl*>(db) != nullptr);
    auto db_impl = static_cast<DBImpl*>(db);
    s = db_impl->WriteWithCallback(write_options_, batch, &callback);
  } else {
    // No expiration configured: a plain write is sufficient.
    s = db->Write(write_options_, batch);
  }

  return s;
}
||||
|
||||
// Abandons the transaction: Cleanup() discards the pending writes and
// releases all locks.
void TransactionImpl::Rollback() {
  Cleanup();
}
||||
|
||||
void TransactionImpl::SetSavePoint() { |
||||
if (num_entries_ > 0) { |
||||
// If transaction is empty, no need to record anything.
|
||||
|
||||
if (save_points_ == nullptr) { |
||||
save_points_.reset(new std::stack<size_t>()); |
||||
} |
||||
save_points_->push(num_entries_); |
||||
} |
||||
} |
||||
|
||||
void TransactionImpl::RollbackToSavePoint() { |
||||
size_t savepoint_entries = 0; |
||||
|
||||
if (save_points_ != nullptr && save_points_->size() > 0) { |
||||
savepoint_entries = save_points_->top(); |
||||
save_points_->pop(); |
||||
} |
||||
|
||||
assert(savepoint_entries <= num_entries_); |
||||
|
||||
if (savepoint_entries == num_entries_) { |
||||
// No changes to rollback
|
||||
} else if (savepoint_entries == 0) { |
||||
// Rollback everything
|
||||
Rollback(); |
||||
} else { |
||||
assert(dynamic_cast<DBImpl*>(db_->GetBaseDB()) != nullptr); |
||||
auto db_impl = reinterpret_cast<DBImpl*>(db_->GetBaseDB()); |
||||
|
||||
WriteBatchWithIndex* new_batch = new WriteBatchWithIndex(cmp_, 0, true); |
||||
Status s = TransactionUtil::CopyFirstN( |
||||
savepoint_entries, write_batch_.get(), new_batch, db_impl); |
||||
if (!s.ok()) { |
||||
// TODO: Should we change this function to return a Status or should we
|
||||
// somehow make it so RollbackToSavePoint() can never fail?? Not easy to
|
||||
// handle the case where a client accesses a column family that's been
|
||||
// dropped.
|
||||
// After chatting with Siying, I'm going to send a diff that adds
|
||||
// savepoint support in WriteBatchWithIndex and let reviewers decide which
|
||||
// approach is cleaner.
|
||||
fprintf(stderr, "STATUS: %s \n", s.ToString().c_str()); |
||||
delete new_batch; |
||||
} else { |
||||
write_batch_.reset(new_batch); |
||||
} |
||||
|
||||
num_entries_ = savepoint_entries; |
||||
} |
||||
} |
||||
|
||||
// Lock all keys in this batch.
|
||||
// On success, caller should unlock keys_to_unlock
|
||||
Status TransactionImpl::LockBatch(WriteBatch* batch, |
||||
TransactionKeyMap* keys_to_unlock) { |
||||
class Handler : public WriteBatch::Handler { |
||||
public: |
||||
// Sorted map of column_family_id to sorted set of keys.
|
||||
// Since LockBatch() always locks keys in sorted order, it cannot deadlock
|
||||
// with itself. We're not using a comparator here since it doesn't matter
|
||||
// what the sorting is as long as it's consistent.
|
||||
std::map<uint32_t, std::set<std::string>> keys_; |
||||
|
||||
Handler() {} |
||||
|
||||
void RecordKey(uint32_t column_family_id, const Slice& key) { |
||||
std::string key_str = key.ToString(); |
||||
|
||||
auto iter = (keys_)[column_family_id].find(key_str); |
||||
if (iter == (keys_)[column_family_id].end()) { |
||||
// key not yet seen, store it.
|
||||
(keys_)[column_family_id].insert({std::move(key_str)}); |
||||
} |
||||
} |
||||
|
||||
virtual Status PutCF(uint32_t column_family_id, const Slice& key, |
||||
const Slice& value) override { |
||||
RecordKey(column_family_id, key); |
||||
return Status::OK(); |
||||
} |
||||
virtual Status MergeCF(uint32_t column_family_id, const Slice& key, |
||||
const Slice& value) override { |
||||
RecordKey(column_family_id, key); |
||||
return Status::OK(); |
||||
} |
||||
virtual Status DeleteCF(uint32_t column_family_id, |
||||
const Slice& key) override { |
||||
RecordKey(column_family_id, key); |
||||
return Status::OK(); |
||||
} |
||||
}; |
||||
|
||||
// Iterating on this handler will add all keys in this batch into keys
|
||||
Handler handler; |
||||
batch->Iterate(&handler); |
||||
|
||||
Status s; |
||||
|
||||
// Attempt to lock all keys
|
||||
for (const auto& cf_iter : handler.keys_) { |
||||
uint32_t cfh_id = cf_iter.first; |
||||
auto& cfh_keys = cf_iter.second; |
||||
|
||||
for (const auto& key_iter : cfh_keys) { |
||||
const std::string& key = key_iter; |
||||
|
||||
s = txn_db_impl_->TryLock(this, cfh_id, key); |
||||
if (!s.ok()) { |
||||
break; |
||||
} |
||||
(*keys_to_unlock)[cfh_id].insert({std::move(key), kMaxSequenceNumber}); |
||||
} |
||||
|
||||
if (!s.ok()) { |
||||
break; |
||||
} |
||||
} |
||||
|
||||
if (!s.ok()) { |
||||
txn_db_impl_->UnLock(this, keys_to_unlock); |
||||
} |
||||
|
||||
return s; |
||||
} |
||||
|
||||
// SliceParts overload: concatenates all parts of `key` into one contiguous
// string and forwards to the Slice-based TryLock().
Status TransactionImpl::TryLock(ColumnFamilyHandle* column_family,
                                const SliceParts& key, bool check_snapshot) {
  // First pass: total length, so the string is allocated only once.
  size_t total_len = 0;
  for (int part = 0; part < key.num_parts; ++part) {
    total_len += key.parts[part].size();
  }

  std::string flat_key;
  flat_key.reserve(total_len);

  // Second pass: append the bytes of every part in order.
  for (int part = 0; part < key.num_parts; ++part) {
    const Slice& piece = key.parts[part];
    flat_key.append(piece.data(), piece.size());
  }

  return TryLock(column_family, flat_key, check_snapshot);
}
||||
|
||||
// Attempt to lock this key.
|
||||
// Returns OK if the key has been successfully locked. Non-ok, otherwise.
|
||||
// If check_shapshot is true and this transaction has a snapshot set,
|
||||
// this key will only be locked if there have been no writes to this key since
|
||||
// the snapshot time.
|
||||
Status TransactionImpl::TryLock(ColumnFamilyHandle* column_family, |
||||
const Slice& key, bool check_snapshot) { |
||||
uint32_t cfh_id = GetColumnFamilyID(column_family); |
||||
std::string key_str = key.ToString(); |
||||
bool previously_locked; |
||||
Status s; |
||||
|
||||
// lock this key if this transactions hasn't already locked it
|
||||
auto iter = tracked_keys_[cfh_id].find(key_str); |
||||
if (iter == tracked_keys_[cfh_id].end()) { |
||||
previously_locked = false; |
||||
|
||||
s = txn_db_impl_->TryLock(this, cfh_id, key_str); |
||||
|
||||
if (s.ok()) { |
||||
// Record that we've locked this key
|
||||
auto result = tracked_keys_[cfh_id].insert({key_str, kMaxSequenceNumber}); |
||||
iter = result.first; |
||||
} |
||||
} else { |
||||
previously_locked = true; |
||||
} |
||||
|
||||
if (s.ok()) { |
||||
// If a snapshot is set, we need to make sure the key hasn't been modified
|
||||
// since the snapshot. This must be done after we locked the key.
|
||||
if (!check_snapshot || snapshot_ == nullptr) { |
||||
// Need to remember the earliest sequence number that we know that this
|
||||
// key has not been modified after. This is useful if this same
|
||||
// transaction
|
||||
// later tries to lock this key again.
|
||||
if (iter->second == kMaxSequenceNumber) { |
||||
// Since we haven't checked a snapshot, we only know this key has not
|
||||
// been modified since after we locked it.
|
||||
iter->second = db_->GetLatestSequenceNumber(); |
||||
} |
||||
} else { |
||||
// If the key has been previous validated at a sequence number earlier
|
||||
// than the curent snapshot's sequence number, we already know it has not
|
||||
// been modified.
|
||||
bool already_validated = iter->second <= snapshot_->GetSequenceNumber(); |
||||
|
||||
if (!already_validated) { |
||||
s = CheckKeySequence(column_family, key); |
||||
|
||||
if (s.ok()) { |
||||
// Record that there have been no writes to this key after this
|
||||
// sequence.
|
||||
iter->second = snapshot_->GetSequenceNumber(); |
||||
} else { |
||||
// Failed to validate key
|
||||
if (!previously_locked) { |
||||
// Unlock key we just locked
|
||||
txn_db_impl_->UnLock(this, cfh_id, key.ToString()); |
||||
tracked_keys_[cfh_id].erase(iter); |
||||
} |
||||
} |
||||
} |
||||
} |
||||
} |
||||
|
||||
return s; |
||||
} |
||||
|
||||
// Return OK() if this key has not been modified more recently than the
|
||||
// transaction snapshot_.
|
||||
Status TransactionImpl::CheckKeySequence(ColumnFamilyHandle* column_family, |
||||
const Slice& key) { |
||||
Status result; |
||||
if (snapshot_ != nullptr) { |
||||
assert(dynamic_cast<DBImpl*>(db_->GetBaseDB()) != nullptr); |
||||
auto db_impl = reinterpret_cast<DBImpl*>(db_->GetBaseDB()); |
||||
|
||||
ColumnFamilyHandle* cfh = column_family ? column_family : |
||||
db_impl->DefaultColumnFamily(); |
||||
|
||||
result = TransactionUtil::CheckKeyForConflicts( |
||||
db_impl, cfh, key.ToString(), |
||||
snapshot_->GetSequenceNumber()); |
||||
} |
||||
|
||||
return result; |
||||
} |
||||
|
||||
// Reads `key` through the transaction's write batch layered over the DB
// (WriteBatchWithIndex::GetFromBatchAndDB).  No lock is taken.
Status TransactionImpl::Get(const ReadOptions& read_options,
                            ColumnFamilyHandle* column_family, const Slice& key,
                            std::string* value) {
  Status s = write_batch_->GetFromBatchAndDB(db_, read_options, column_family,
                                             key, value);
  return s;
}
||||
|
||||
// Locks `key` and, if that succeeds and `value` is non-null, reads it.
// Passing a null `value` only acquires the lock.
Status TransactionImpl::GetForUpdate(const ReadOptions& read_options,
                                     ColumnFamilyHandle* column_family,
                                     const Slice& key, std::string* value) {
  Status s = TryLock(column_family, key);

  if (!s.ok() || value == nullptr) {
    // Either the lock failed or the caller is not interested in the value.
    return s;
  }

  return Get(read_options, column_family, key, value);
}
||||
|
||||
std::vector<Status> TransactionImpl::MultiGet( |
||||
const ReadOptions& read_options, |
||||
const std::vector<ColumnFamilyHandle*>& column_family, |
||||
const std::vector<Slice>& keys, std::vector<std::string>* values) { |
||||
size_t num_keys = keys.size(); |
||||
values->resize(num_keys); |
||||
|
||||
std::vector<Status> stat_list(num_keys); |
||||
for (size_t i = 0; i < num_keys; ++i) { |
||||
std::string* value = values ? &(*values)[i] : nullptr; |
||||
stat_list[i] = Get(read_options, column_family[i], keys[i], value); |
||||
} |
||||
|
||||
return stat_list; |
||||
} |
||||
|
||||
std::vector<Status> TransactionImpl::MultiGetForUpdate( |
||||
const ReadOptions& read_options, |
||||
const std::vector<ColumnFamilyHandle*>& column_family, |
||||
const std::vector<Slice>& keys, std::vector<std::string>* values) { |
||||
// Regardless of whether the MultiGet succeeded, track these keys.
|
||||
size_t num_keys = keys.size(); |
||||
values->resize(num_keys); |
||||
|
||||
// Lock all keys
|
||||
for (size_t i = 0; i < num_keys; ++i) { |
||||
Status s = TryLock(column_family[i], keys[i]); |
||||
if (!s.ok()) { |
||||
// Fail entire multiget if we cannot lock all keys
|
||||
return std::vector<Status>(num_keys, s); |
||||
} |
||||
} |
||||
|
||||
// TODO(agiardullo): optimize multiget?
|
||||
std::vector<Status> stat_list(num_keys); |
||||
for (size_t i = 0; i < num_keys; ++i) { |
||||
std::string* value = values ? &(*values)[i] : nullptr; |
||||
stat_list[i] = Get(read_options, column_family[i], keys[i], value); |
||||
} |
||||
|
||||
return stat_list; |
||||
} |
||||
|
||||
// Returns an iterator that layers the transaction's write batch on top of a
// new DB iterator created with `read_options`.
Iterator* TransactionImpl::GetIterator(const ReadOptions& read_options) {
  Iterator* base_iter = db_->NewIterator(read_options);
  assert(base_iter != nullptr);

  return write_batch_->NewIteratorWithBase(base_iter);
}
||||
|
||||
// Column-family variant: returns an iterator that layers the transaction's
// write batch on top of a new DB iterator over `column_family`.
Iterator* TransactionImpl::GetIterator(const ReadOptions& read_options,
                                       ColumnFamilyHandle* column_family) {
  Iterator* base_iter = db_->NewIterator(read_options, column_family);
  assert(base_iter != nullptr);

  return write_batch_->NewIteratorWithBase(column_family, base_iter);
}
||||
|
||||
// Locks `key` and, once the lock is held, buffers the put in the
// transaction's write batch.  Nothing is buffered if locking fails.
Status TransactionImpl::Put(ColumnFamilyHandle* column_family, const Slice& key,
                            const Slice& value) {
  Status s = TryLock(column_family, key);
  if (!s.ok()) {
    return s;
  }

  write_batch_->Put(column_family, key, value);
  ++num_entries_;
  return s;
}
||||
|
||||
// SliceParts variant of Put(): locks `key` and, once the lock is held,
// buffers the put in the transaction's write batch.
Status TransactionImpl::Put(ColumnFamilyHandle* column_family,
                            const SliceParts& key, const SliceParts& value) {
  Status s = TryLock(column_family, key);
  if (!s.ok()) {
    return s;
  }

  write_batch_->Put(column_family, key, value);
  ++num_entries_;
  return s;
}
||||
|
||||
// Locks `key` and, once the lock is held, buffers the merge in the
// transaction's write batch.  Nothing is buffered if locking fails.
Status TransactionImpl::Merge(ColumnFamilyHandle* column_family,
                              const Slice& key, const Slice& value) {
  Status s = TryLock(column_family, key);
  if (!s.ok()) {
    return s;
  }

  write_batch_->Merge(column_family, key, value);
  ++num_entries_;
  return s;
}
||||
|
||||
// Locks `key` and, once the lock is held, buffers the delete in the
// transaction's write batch.  Nothing is buffered if locking fails.
Status TransactionImpl::Delete(ColumnFamilyHandle* column_family,
                               const Slice& key) {
  Status s = TryLock(column_family, key);
  if (!s.ok()) {
    return s;
  }

  write_batch_->Delete(column_family, key);
  ++num_entries_;
  return s;
}
||||
|
||||
// SliceParts variant of Delete(): locks `key` and, once the lock is held,
// buffers the delete in the transaction's write batch.
Status TransactionImpl::Delete(ColumnFamilyHandle* column_family,
                               const SliceParts& key) {
  Status s = TryLock(column_family, key);
  if (!s.ok()) {
    return s;
  }

  write_batch_->Delete(column_family, key);
  ++num_entries_;
  return s;
}
||||
|
||||
// Buffers a put without snapshot validation.
//
// Even though we do not care about doing conflict checking for this write,
// we still need to take a lock to make sure we do not cause a conflict with
// some other write.  However, we do not need to check if there have been
// any writes since this transaction's snapshot.
Status TransactionImpl::PutUntracked(ColumnFamilyHandle* column_family,
                                     const Slice& key, const Slice& value) {
  const bool check_snapshot = false;

  // TODO(agiardullo): could optimize by supporting shared txn locks in the
  // future
  Status s = TryLock(column_family, key, check_snapshot);
  if (!s.ok()) {
    return s;
  }

  write_batch_->Put(column_family, key, value);
  ++num_entries_;
  return s;
}
||||
|
||||
// SliceParts flavour of PutUntracked(): takes the key lock without the
// snapshot check (check_snapshot == false) and buffers the Put only if the
// lock was granted. Returns the status from TryLock().
Status TransactionImpl::PutUntracked(ColumnFamilyHandle* column_family,
                                     const SliceParts& key,
                                     const SliceParts& value) {
  Status lock_status =
      TryLock(column_family, key, /*check_snapshot=*/false);
  if (!lock_status.ok()) {
    return lock_status;
  }

  write_batch_->Put(column_family, key, value);
  ++num_entries_;
  return lock_status;
}
||||
|
||||
// Merge that skips the snapshot check: takes the key lock with
// check_snapshot == false and buffers the Merge only if the lock was granted.
// Returns the status from TryLock().
Status TransactionImpl::MergeUntracked(ColumnFamilyHandle* column_family,
                                       const Slice& key, const Slice& value) {
  Status lock_status =
      TryLock(column_family, key, /*check_snapshot=*/false);
  if (!lock_status.ok()) {
    return lock_status;
  }

  write_batch_->Merge(column_family, key, value);
  ++num_entries_;
  return lock_status;
}
||||
|
||||
// Delete that skips the snapshot check: takes the key lock with
// check_snapshot == false and buffers the Delete only if the lock was
// granted. Returns the status from TryLock().
Status TransactionImpl::DeleteUntracked(ColumnFamilyHandle* column_family,
                                        const Slice& key) {
  Status lock_status =
      TryLock(column_family, key, /*check_snapshot=*/false);
  if (!lock_status.ok()) {
    return lock_status;
  }

  write_batch_->Delete(column_family, key);
  ++num_entries_;
  return lock_status;
}
||||
|
||||
// SliceParts flavour of DeleteUntracked(): takes the key lock with
// check_snapshot == false and buffers the Delete only if the lock was
// granted. Returns the status from TryLock().
Status TransactionImpl::DeleteUntracked(ColumnFamilyHandle* column_family,
                                        const SliceParts& key) {
  Status lock_status =
      TryLock(column_family, key, /*check_snapshot=*/false);
  if (!lock_status.ok()) {
    return lock_status;
  }

  write_batch_->Delete(column_family, key);
  ++num_entries_;
  return lock_status;
}
||||
|
||||
// Appends `blob` to the write batch as log data. No key lock is taken, but
// the entry is still counted in num_entries_.
void TransactionImpl::PutLogData(const Slice& blob) {
  write_batch_->PutLogData(blob);
  ++num_entries_;
}
||||
|
||||
// Returns a non-owning pointer to the transaction's write batch; the batch
// remains owned by write_batch_.
WriteBatchWithIndex* TransactionImpl::GetWriteBatch() {
  WriteBatchWithIndex* const batch = write_batch_.get();
  return batch;
}
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // ROCKSDB_LITE
|
@ -0,0 +1,263 @@ |
||||
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once |
||||
|
||||
#ifndef ROCKSDB_LITE |
||||
|
||||
#include <atomic> |
||||
#include <stack> |
||||
#include <string> |
||||
#include <unordered_map> |
||||
#include <vector> |
||||
|
||||
#include "db/write_callback.h" |
||||
#include "rocksdb/db.h" |
||||
#include "rocksdb/slice.h" |
||||
#include "rocksdb/status.h" |
||||
#include "rocksdb/types.h" |
||||
#include "rocksdb/utilities/transaction.h" |
||||
#include "rocksdb/utilities/transaction_db.h" |
||||
#include "rocksdb/utilities/write_batch_with_index.h" |
||||
#include "utilities/transactions/transaction_util.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
using TransactionID = uint64_t; |
||||
|
||||
class TransactionDBImpl; |
||||
|
||||
class TransactionImpl : public Transaction { |
||||
public: |
||||
TransactionImpl(TransactionDB* db, const WriteOptions& write_options, |
||||
const TransactionOptions& txn_options); |
||||
|
||||
virtual ~TransactionImpl(); |
||||
|
||||
Status Commit() override; |
||||
|
||||
Status CommitBatch(WriteBatch* batch); |
||||
|
||||
void Rollback() override; |
||||
|
||||
void SetSavePoint() override; |
||||
|
||||
void RollbackToSavePoint() override; |
||||
|
||||
Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, |
||||
const Slice& key, std::string* value) override; |
||||
|
||||
Status Get(const ReadOptions& options, const Slice& key, |
||||
std::string* value) override { |
||||
return Get(options, db_->DefaultColumnFamily(), key, value); |
||||
} |
||||
|
||||
Status GetForUpdate(const ReadOptions& options, |
||||
ColumnFamilyHandle* column_family, const Slice& key, |
||||
std::string* value) override; |
||||
|
||||
Status GetForUpdate(const ReadOptions& options, const Slice& key, |
||||
std::string* value) override { |
||||
return GetForUpdate(options, db_->DefaultColumnFamily(), key, value); |
||||
} |
||||
|
||||
std::vector<Status> MultiGet( |
||||
const ReadOptions& options, |
||||
const std::vector<ColumnFamilyHandle*>& column_family, |
||||
const std::vector<Slice>& keys, |
||||
std::vector<std::string>* values) override; |
||||
|
||||
std::vector<Status> MultiGet(const ReadOptions& options, |
||||
const std::vector<Slice>& keys, |
||||
std::vector<std::string>* values) override { |
||||
return MultiGet(options, std::vector<ColumnFamilyHandle*>( |
||||
keys.size(), db_->DefaultColumnFamily()), |
||||
keys, values); |
||||
} |
||||
|
||||
std::vector<Status> MultiGetForUpdate( |
||||
const ReadOptions& options, |
||||
const std::vector<ColumnFamilyHandle*>& column_family, |
||||
const std::vector<Slice>& keys, |
||||
std::vector<std::string>* values) override; |
||||
|
||||
std::vector<Status> MultiGetForUpdate( |
||||
const ReadOptions& options, const std::vector<Slice>& keys, |
||||
std::vector<std::string>* values) override { |
||||
return MultiGetForUpdate(options, |
||||
std::vector<ColumnFamilyHandle*>( |
||||
keys.size(), db_->DefaultColumnFamily()), |
||||
keys, values); |
||||
} |
||||
|
||||
Iterator* GetIterator(const ReadOptions& read_options) override; |
||||
Iterator* GetIterator(const ReadOptions& read_options, |
||||
ColumnFamilyHandle* column_family) override; |
||||
|
||||
Status Put(ColumnFamilyHandle* column_family, const Slice& key, |
||||
const Slice& value) override; |
||||
Status Put(const Slice& key, const Slice& value) override { |
||||
return Put(nullptr, key, value); |
||||
} |
||||
|
||||
Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, |
||||
const SliceParts& value) override; |
||||
Status Put(const SliceParts& key, const SliceParts& value) override { |
||||
return Put(nullptr, key, value); |
||||
} |
||||
|
||||
Status Merge(ColumnFamilyHandle* column_family, const Slice& key, |
||||
const Slice& value) override; |
||||
Status Merge(const Slice& key, const Slice& value) override { |
||||
return Merge(nullptr, key, value); |
||||
} |
||||
|
||||
Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override; |
||||
Status Delete(const Slice& key) override { return Delete(nullptr, key); } |
||||
Status Delete(ColumnFamilyHandle* column_family, |
||||
const SliceParts& key) override; |
||||
Status Delete(const SliceParts& key) override { return Delete(nullptr, key); } |
||||
|
||||
Status PutUntracked(ColumnFamilyHandle* column_family, const Slice& key, |
||||
const Slice& value) override; |
||||
Status PutUntracked(const Slice& key, const Slice& value) override { |
||||
return PutUntracked(nullptr, key, value); |
||||
} |
||||
|
||||
Status PutUntracked(ColumnFamilyHandle* column_family, const SliceParts& key, |
||||
const SliceParts& value) override; |
||||
Status PutUntracked(const SliceParts& key, const SliceParts& value) override { |
||||
return PutUntracked(nullptr, key, value); |
||||
} |
||||
|
||||
Status MergeUntracked(ColumnFamilyHandle* column_family, const Slice& key, |
||||
const Slice& value) override; |
||||
Status MergeUntracked(const Slice& key, const Slice& value) override { |
||||
return MergeUntracked(nullptr, key, value); |
||||
} |
||||
|
||||
Status DeleteUntracked(ColumnFamilyHandle* column_family, |
||||
const Slice& key) override; |
||||
Status DeleteUntracked(const Slice& key) override { |
||||
return DeleteUntracked(nullptr, key); |
||||
} |
||||
Status DeleteUntracked(ColumnFamilyHandle* column_family, |
||||
const SliceParts& key) override; |
||||
Status DeleteUntracked(const SliceParts& key) override { |
||||
return DeleteUntracked(nullptr, key); |
||||
} |
||||
|
||||
void PutLogData(const Slice& blob) override; |
||||
|
||||
const Snapshot* GetSnapshot() const override { return snapshot_; } |
||||
|
||||
void SetSnapshot() override; |
||||
|
||||
WriteBatchWithIndex* GetWriteBatch() override; |
||||
|
||||
// Generate a new unique transaction identifier
|
||||
static TransactionID GenTxnID(); |
||||
|
||||
TransactionID GetTxnID() const { return txn_id_; } |
||||
|
||||
// Returns the time (in milliseconds according to Env->GetMicros()*1000)
|
||||
// that this transaction will be expired. Returns 0 if this transaction does
|
||||
// not expire.
|
||||
uint64_t GetExpirationTime() const { return expiration_time_; } |
||||
|
||||
// returns true if this transaction has an expiration_time and has expired.
|
||||
bool IsExpired() const; |
||||
|
||||
// Returns the number of milliseconds a transaction can wait on acquiring a
|
||||
// lock or -1 if there is no timeout.
|
||||
int64_t GetLockTimeout() const { return lock_timeout_; } |
||||
void SetLockTimeout(int64_t timeout) { lock_timeout_ = timeout; } |
||||
|
||||
private: |
||||
TransactionDB* const db_; |
||||
|
||||
TransactionDBImpl* txn_db_impl_; |
||||
|
||||
// Used to create unique ids for transactions.
|
||||
static std::atomic<TransactionID> txn_id_counter_; |
||||
|
||||
// Unique ID for this transaction
|
||||
const TransactionID txn_id_; |
||||
|
||||
const WriteOptions write_options_; |
||||
|
||||
// If snapshot_ is set, all keys that locked must also have not been written
|
||||
// since this snapshot
|
||||
const Snapshot* snapshot_; |
||||
|
||||
const Comparator* cmp_; |
||||
|
||||
std::unique_ptr<WriteBatchWithIndex> write_batch_; |
||||
|
||||
// If expiration_ is non-zero, start_time_ stores that time the txn was
|
||||
// constructed,
|
||||
// in milliseconds.
|
||||
const uint64_t start_time_; |
||||
|
||||
// If non-zero, this transaction should not be committed after this time (in
|
||||
// milliseconds)
|
||||
const uint64_t expiration_time_; |
||||
|
||||
// Timeout in microseconds when locking a key or -1 if there is no timeout.
|
||||
int64_t lock_timeout_; |
||||
|
||||
// Map from column_family_id to map of keys to Sequence Numbers. Stores keys
|
||||
// that have been locked.
|
||||
// The key is known to not have been modified after the Sequence Number
|
||||
// stored.
|
||||
TransactionKeyMap tracked_keys_; |
||||
|
||||
// Records the number of entries currently in the WriteBatch include calls to
|
||||
// PutLogData()
|
||||
size_t num_entries_ = 0; |
||||
|
||||
// Stack of number of entries in write_batch at each save point
|
||||
std::unique_ptr<std::stack<size_t>> save_points_; |
||||
|
||||
Status TryLock(ColumnFamilyHandle* column_family, const Slice& key, |
||||
bool check_snapshot = true); |
||||
Status TryLock(ColumnFamilyHandle* column_family, const SliceParts& key, |
||||
bool check_snapshot = true); |
||||
void Cleanup(); |
||||
|
||||
Status CheckKeySequence(ColumnFamilyHandle* column_family, const Slice& key); |
||||
|
||||
Status LockBatch(WriteBatch* batch, TransactionKeyMap* keys_to_unlock); |
||||
|
||||
Status DoCommit(WriteBatch* batch); |
||||
|
||||
void RollbackLastN(size_t num); |
||||
|
||||
// No copying allowed
|
||||
TransactionImpl(const TransactionImpl&); |
||||
void operator=(const TransactionImpl&); |
||||
}; |
||||
|
||||
// Used at commit time to check whether transaction is committing before its
|
||||
// expiration time.
|
||||
class TransactionCallback : public WriteCallback { |
||||
public: |
||||
explicit TransactionCallback(TransactionImpl* txn) : txn_(txn) {} |
||||
|
||||
Status Callback(DB* db) override { |
||||
if (txn_->IsExpired()) { |
||||
return Status::TimedOut(); |
||||
} else { |
||||
return Status::OK(); |
||||
} |
||||
} |
||||
|
||||
private: |
||||
TransactionImpl* txn_; |
||||
}; |
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // ROCKSDB_LITE
|
@ -0,0 +1,443 @@ |
||||
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#ifndef ROCKSDB_LITE |
||||
|
||||
#ifndef __STDC_FORMAT_MACROS |
||||
#define __STDC_FORMAT_MACROS |
||||
#endif |
||||
|
||||
#include "utilities/transactions/transaction_lock_mgr.h" |
||||
|
||||
#include <inttypes.h> |
||||
|
||||
#include <algorithm> |
||||
#include <condition_variable> |
||||
#include <functional> |
||||
#include <mutex> |
||||
#include <string> |
||||
#include <vector> |
||||
|
||||
#include "rocksdb/slice.h" |
||||
#include "util/autovector.h" |
||||
#include "util/murmurhash.h" |
||||
#include "util/thread_local.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
struct LockInfo { |
||||
TransactionID txn_id; |
||||
uint64_t |
||||
expiration_time; // Transaction locks are not valid after this time in ms
|
||||
LockInfo(TransactionID id, uint64_t time) |
||||
: txn_id(id), expiration_time(time) {} |
||||
LockInfo(const LockInfo& lock_info) |
||||
: txn_id(lock_info.txn_id), expiration_time(lock_info.expiration_time) {} |
||||
}; |
||||
|
||||
struct LockMapStripe { |
||||
// Mutex must be held before modifying keys map
|
||||
std::timed_mutex stripe_mutex; |
||||
|
||||
// Condition Variable per stripe for waiting on a lock
|
||||
std::condition_variable_any stripe_cv; |
||||
|
||||
// Locked keys mapped to the info about the transactions that locked them.
|
||||
// TODO(agiardullo): Explore performance of other data structures.
|
||||
std::unordered_map<std::string, LockInfo> keys; |
||||
}; |
||||
|
||||
// Map of #num_stripes LockMapStripes
|
||||
struct LockMap { |
||||
explicit LockMap(size_t num_stripes) |
||||
: num_stripes_(num_stripes), lock_map_stripes_(num_stripes) {} |
||||
|
||||
LockMap(const LockMap& lock_map) |
||||
: num_stripes_(lock_map.num_stripes_), lock_map_stripes_(num_stripes_) {} |
||||
|
||||
// Number of sepearate LockMapStripes to create, each with their own Mutex
|
||||
const size_t num_stripes_; |
||||
|
||||
// Count of keys that are currently locked in this column family.
|
||||
// (Only maintained if TransactionLockMgr::max_num_locks_ is positive.)
|
||||
std::atomic<int64_t> lock_cnt{0}; |
||||
|
||||
std::vector<LockMapStripe> lock_map_stripes_; |
||||
|
||||
size_t GetStripe(const std::string& key) const; |
||||
}; |
||||
|
||||
namespace { |
||||
void UnrefLockMapsCache(void* ptr) { |
||||
// Called when a thread exits or a ThreadLocalPtr gets destroyed.
|
||||
auto lock_maps_cache = |
||||
static_cast<std::unordered_map<uint32_t, std::shared_ptr<LockMap>>*>(ptr); |
||||
delete lock_maps_cache; |
||||
} |
||||
} // anonymous namespace
|
||||
|
||||
// Stores the stripe count used for newly added column families and the
// per-column-family lock limit, and creates the thread-local cache whose
// entries are released through UnrefLockMapsCache.
TransactionLockMgr::TransactionLockMgr(size_t default_num_stripes,
                                       int64_t max_num_locks)
    : default_num_stripes_(default_num_stripes),
      max_num_locks_(max_num_locks),
      lock_maps_cache_(new ThreadLocalPtr(&UnrefLockMapsCache)) {
}
||||
|
||||
// Nothing to release by hand: all members clean up after themselves.
TransactionLockMgr::~TransactionLockMgr() = default;
||||
|
||||
size_t LockMap::GetStripe(const std::string& key) const { |
||||
assert(num_stripes_ > 0); |
||||
static murmur_hash hash; |
||||
size_t stripe = hash(key) % num_stripes_; |
||||
return stripe; |
||||
} |
||||
|
||||
void TransactionLockMgr::AddColumnFamily(uint32_t column_family_id) { |
||||
InstrumentedMutexLock l(&lock_map_mutex_); |
||||
|
||||
if (lock_maps_.find(column_family_id) == lock_maps_.end()) { |
||||
lock_maps_.emplace( |
||||
column_family_id, |
||||
std::shared_ptr<LockMap>(new LockMap(default_num_stripes_))); |
||||
} else { |
||||
// column_family already exists in lock map
|
||||
assert(false); |
||||
} |
||||
} |
||||
|
||||
void TransactionLockMgr::RemoveColumnFamily(uint32_t column_family_id) { |
||||
// Remove lock_map for this column family. Since the lock map is stored
|
||||
// as a shared ptr, concurrent transactions can still keep keep using it
|
||||
// until they release their reference to it.
|
||||
{ |
||||
InstrumentedMutexLock l(&lock_map_mutex_); |
||||
|
||||
auto lock_maps_iter = lock_maps_.find(column_family_id); |
||||
assert(lock_maps_iter != lock_maps_.end()); |
||||
|
||||
lock_maps_.erase(lock_maps_iter); |
||||
} // lock_map_mutex_
|
||||
|
||||
// Clear all thread-local caches
|
||||
autovector<void*> local_caches; |
||||
lock_maps_cache_->Scrape(&local_caches, nullptr); |
||||
for (auto cache : local_caches) { |
||||
delete static_cast<LockMaps*>(cache); |
||||
} |
||||
} |
||||
|
||||
// Look up the LockMap shared_ptr for a given column_family_id.
|
||||
// Note: The LockMap is only valid as long as the caller is still holding on
|
||||
// to the returned shared_ptr.
|
||||
std::shared_ptr<LockMap> TransactionLockMgr::GetLockMap( |
||||
uint32_t column_family_id) { |
||||
// First check thread-local cache
|
||||
if (lock_maps_cache_->Get() == nullptr) { |
||||
lock_maps_cache_->Reset(new LockMaps()); |
||||
} |
||||
|
||||
auto lock_maps_cache = static_cast<LockMaps*>(lock_maps_cache_->Get()); |
||||
|
||||
auto lock_map_iter = lock_maps_cache->find(column_family_id); |
||||
if (lock_map_iter != lock_maps_cache->end()) { |
||||
// Found lock map for this column family.
|
||||
return lock_map_iter->second; |
||||
} |
||||
|
||||
// Not found in local cache, grab mutex and check shared LockMaps
|
||||
InstrumentedMutexLock l(&lock_map_mutex_); |
||||
|
||||
lock_map_iter = lock_maps_.find(column_family_id); |
||||
if (lock_map_iter == lock_maps_.end()) { |
||||
return std::shared_ptr<LockMap>(nullptr); |
||||
} else { |
||||
// Found lock map. Store in thread-local cache and return.
|
||||
std::shared_ptr<LockMap>& lock_map = lock_map_iter->second; |
||||
lock_maps_cache->insert({column_family_id, lock_map}); |
||||
|
||||
return lock_map; |
||||
} |
||||
} |
||||
|
||||
// Returns true if this lock has expired and can be acquired by another
|
||||
// transaction.
|
||||
// If false, returns the number of microseconds until expiration in
|
||||
// *wait_time_us, or 0 if no expiration.
|
||||
bool TransactionLockMgr::IsLockExpired(const LockInfo& lock_info, Env* env, |
||||
uint64_t* wait_time_us) { |
||||
auto now = env->NowMicros(); |
||||
|
||||
bool expired = (lock_info.expiration_time > 0 && |
||||
lock_info.expiration_time * 1000 <= now); |
||||
|
||||
if (!expired && lock_info.expiration_time > 0 && wait_time_us != nullptr) { |
||||
// return how many microseconds until lock will be expired
|
||||
*wait_time_us = (lock_info.expiration_time * 1000 - now); |
||||
} |
||||
|
||||
return expired; |
||||
} |
||||
|
||||
// Attempts to lock `key` in `column_family_id` on behalf of `txn`, using the
// transaction's id, expiration time and lock timeout. Returns
// InvalidArgument if no LockMap is registered for the column family;
// otherwise returns whatever AcquireWithTimeout() reports.
Status TransactionLockMgr::TryLock(const TransactionImpl* txn,
                                   uint32_t column_family_id,
                                   const std::string& key, Env* env) {
  // Lookup lock map for this column family id. The shared_ptr keeps the map
  // alive for the duration of this call.
  const std::shared_ptr<LockMap> lock_map_holder =
      GetLockMap(column_family_id);
  LockMap* const lock_map = lock_map_holder.get();
  if (lock_map == nullptr) {
    char msg[255];
    snprintf(msg, sizeof(msg), "Column family id not found: %" PRIu32,
             column_family_id);
    return Status::InvalidArgument(msg);
  }

  // Need to lock the mutex for the stripe that this key hashes to
  const size_t stripe_index = lock_map->GetStripe(key);
  assert(lock_map->lock_map_stripes_.size() > stripe_index);
  LockMapStripe* const stripe = &lock_map->lock_map_stripes_.at(stripe_index);

  const LockInfo requested(txn->GetTxnID(), txn->GetExpirationTime());
  return AcquireWithTimeout(lock_map, stripe, key, env, txn->GetLockTimeout(),
                            requested);
}
||||
|
||||
// Helper function for TryLock().
|
||||
Status TransactionLockMgr::AcquireWithTimeout(LockMap* lock_map, |
||||
LockMapStripe* stripe, |
||||
const std::string& key, Env* env, |
||||
int64_t timeout, |
||||
const LockInfo& lock_info) { |
||||
std::chrono::system_clock::time_point end_time; |
||||
|
||||
if (timeout > 0) { |
||||
end_time = |
||||
std::chrono::system_clock::now() + std::chrono::milliseconds(timeout); |
||||
} |
||||
|
||||
bool locked = true; |
||||
if (timeout == 0) { |
||||
// If timeout is 0, we do not wait to acquire the lock if it is not
|
||||
// available
|
||||
locked = stripe->stripe_mutex.try_lock(); |
||||
} else if (timeout < 0) { |
||||
// If timeout is negative, we wait indefinitely to acquire the lock
|
||||
stripe->stripe_mutex.lock(); |
||||
} else { |
||||
// If timeout is positive, we attempt to acquire the lock unless we timeout
|
||||
locked = stripe->stripe_mutex.try_lock_until(end_time); |
||||
} |
||||
|
||||
if (!locked) { |
||||
// timeout acquiring mutex
|
||||
return Status::Busy(); |
||||
} |
||||
|
||||
// Acquire lock if we are able to
|
||||
uint64_t wait_time_us = 0; |
||||
Status result = |
||||
AcquireLocked(lock_map, stripe, key, env, lock_info, &wait_time_us); |
||||
|
||||
if (result.IsBusy() && timeout != 0) { |
||||
// If we weren't able to acquire the lock, we will keep retrying as long
|
||||
// as the
|
||||
// timeout allows.
|
||||
bool timed_out = false; |
||||
do { |
||||
// Check to see if the lock expires sooner than our timeout.
|
||||
std::chrono::system_clock::time_point wait_time_end; |
||||
if (wait_time_us > 0 && |
||||
(timeout < 0 || |
||||
wait_time_us < static_cast<uint64_t>(timeout * 1000))) { |
||||
wait_time_end = std::chrono::system_clock::now() + |
||||
std::chrono::microseconds(wait_time_us); |
||||
if (timeout > 0 && wait_time_end >= end_time) { |
||||
// lock expiration time is after our timeout.
|
||||
wait_time_us = 0; |
||||
} |
||||
} else { |
||||
wait_time_us = 0; |
||||
} |
||||
|
||||
if (wait_time_us > 0) { |
||||
// Wait up to the locks current expiration time
|
||||
stripe->stripe_cv.wait_until(stripe->stripe_mutex, wait_time_end); |
||||
} else if (timeout > 0) { |
||||
// Wait until we timeout
|
||||
auto cv_status = |
||||
stripe->stripe_cv.wait_until(stripe->stripe_mutex, end_time); |
||||
|
||||
if (cv_status == std::cv_status::timeout) { |
||||
timed_out = true; |
||||
// Even though we timed out, we will still make one more attempt to
|
||||
// acquire lock below (it is possible the lock expired and we
|
||||
// were never signaled).
|
||||
} |
||||
} else { |
||||
// No wait timeout.
|
||||
stripe->stripe_cv.wait(stripe->stripe_mutex); |
||||
} |
||||
|
||||
result = |
||||
AcquireLocked(lock_map, stripe, key, env, lock_info, &wait_time_us); |
||||
} while (result.IsBusy() && !timed_out); |
||||
} |
||||
|
||||
stripe->stripe_mutex.unlock(); |
||||
|
||||
return result; |
||||
} |
||||
|
||||
// Try to lock this key after we have acquired the mutex.
|
||||
// Returns the number of microseconds until expiration in *wait_time_us,
|
||||
// or 0 if no expiration.
|
||||
// REQUIRED: Stripe mutex must be held.
|
||||
Status TransactionLockMgr::AcquireLocked(LockMap* lock_map, |
||||
LockMapStripe* stripe, |
||||
const std::string& key, Env* env, |
||||
const LockInfo& txn_lock_info, |
||||
uint64_t* wait_time_us) { |
||||
Status result; |
||||
// Check if this key is already locked
|
||||
if (stripe->keys.find(key) != stripe->keys.end()) { |
||||
// Lock already held
|
||||
|
||||
LockInfo& lock_info = stripe->keys.at(key); |
||||
if (lock_info.txn_id != txn_lock_info.txn_id) { |
||||
// locked by another txn. Check if it's expired
|
||||
if (IsLockExpired(lock_info, env, wait_time_us)) { |
||||
// lock is expired, can steal it
|
||||
lock_info.txn_id = txn_lock_info.txn_id; |
||||
lock_info.expiration_time = txn_lock_info.expiration_time; |
||||
// lock_cnt does not change
|
||||
} else { |
||||
result = Status::Busy(); |
||||
} |
||||
} |
||||
} else { // Lock not held.
|
||||
// Check lock limit
|
||||
if (max_num_locks_ > 0 && |
||||
lock_map->lock_cnt.load(std::memory_order_acquire) >= max_num_locks_) { |
||||
result = |
||||
Status::Busy("Failed to acquire lock due to max_num_locks limit"); |
||||
} else { |
||||
// acquire lock
|
||||
stripe->keys.insert({key, txn_lock_info}); |
||||
|
||||
// Maintain lock count if there is a limit on the number of locks
|
||||
if (max_num_locks_) { |
||||
lock_map->lock_cnt++; |
||||
} |
||||
} |
||||
} |
||||
|
||||
return result; |
||||
} |
||||
|
||||
// Releases the lock on `key` in `column_family_id` if it is held by `txn`,
// then wakes every thread waiting on that key's stripe. Does nothing if the
// column family has no LockMap.
void TransactionLockMgr::UnLock(TransactionImpl* txn, uint32_t column_family_id,
                                const std::string& key, Env* env) {
  const std::shared_ptr<LockMap> lock_map_holder =
      GetLockMap(column_family_id);
  LockMap* const lock_map = lock_map_holder.get();
  if (lock_map == nullptr) {
    // Column Family must have been dropped.
    return;
  }

  // Lock the mutex for the stripe that this key hashes to
  const size_t stripe_index = lock_map->GetStripe(key);
  assert(lock_map->lock_map_stripes_.size() > stripe_index);
  LockMapStripe* const stripe = &lock_map->lock_map_stripes_.at(stripe_index);

  const TransactionID owner_id = txn->GetTxnID();
  {
    std::lock_guard<std::timed_mutex> stripe_guard(stripe->stripe_mutex);

    const auto held = stripe->keys.find(key);
    const bool held_by_us =
        held != stripe->keys.end() && held->second.txn_id == owner_id;
    if (!held_by_us) {
      // This key is either not locked or locked by someone else.  This should
      // only happen if the unlocking transaction has expired.
      assert(txn->GetExpirationTime() > 0 &&
             txn->GetExpirationTime() * 1000 < env->NowMicros());
    } else {
      // Found the key we locked.  unlock it.
      stripe->keys.erase(held);
      if (max_num_locks_ > 0) {
        // Maintain lock count if there is a limit on the number of locks.
        assert(lock_map->lock_cnt.load(std::memory_order_relaxed) > 0);
        lock_map->lock_cnt--;
      }
    }
  }  // stripe_mutex unlocked

  // Signal waiting threads to retry locking
  stripe->stripe_cv.notify_all();
}
||||
|
||||
void TransactionLockMgr::UnLock(const TransactionImpl* txn, |
||||
const TransactionKeyMap* key_map, Env* env) { |
||||
TransactionID txn_id = txn->GetTxnID(); |
||||
|
||||
for (auto& key_map_iter : *key_map) { |
||||
uint32_t column_family_id = key_map_iter.first; |
||||
auto& keys = key_map_iter.second; |
||||
|
||||
std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id); |
||||
LockMap* lock_map = lock_map_ptr.get(); |
||||
|
||||
if (lock_map == nullptr) { |
||||
// Column Family must have been dropped.
|
||||
return; |
||||
} |
||||
|
||||
// Bucket keys by lock_map_ stripe
|
||||
std::unordered_map<size_t, std::vector<const std::string*>> keys_by_stripe( |
||||
std::max(keys.size(), lock_map->num_stripes_)); |
||||
|
||||
for (auto& key_iter : keys) { |
||||
const std::string& key = key_iter.first; |
||||
|
||||
size_t stripe_num = lock_map->GetStripe(key); |
||||
keys_by_stripe[stripe_num].push_back(&key); |
||||
} |
||||
|
||||
// For each stripe, grab the stripe mutex and unlock all keys in this stripe
|
||||
for (auto& stripe_iter : keys_by_stripe) { |
||||
size_t stripe_num = stripe_iter.first; |
||||
auto& stripe_keys = stripe_iter.second; |
||||
|
||||
assert(lock_map->lock_map_stripes_.size() > stripe_num); |
||||
LockMapStripe* stripe = &lock_map->lock_map_stripes_.at(stripe_num); |
||||
|
||||
{ |
||||
std::lock_guard<std::timed_mutex> lock(stripe->stripe_mutex); |
||||
|
||||
for (const std::string* key : stripe_keys) { |
||||
const auto& iter = stripe->keys.find(*key); |
||||
if (iter != stripe->keys.end() && iter->second.txn_id == txn_id) { |
||||
// Found the key we locked. unlock it.
|
||||
stripe->keys.erase(iter); |
||||
if (max_num_locks_ > 0) { |
||||
// Maintain lock count if there is a limit on the number of locks.
|
||||
assert(lock_map->lock_cnt.load(std::memory_order_relaxed) > 0); |
||||
lock_map->lock_cnt--; |
||||
} |
||||
} else { |
||||
// This key is either not locked or locked by someone else. This
|
||||
// should only
|
||||
// happen if the unlocking transaction has expired.
|
||||
assert(txn->GetExpirationTime() > 0 && |
||||
txn->GetExpirationTime() * 1000 < env->NowMicros()); |
||||
} |
||||
} |
||||
} // stripe_mutex unlocked
|
||||
|
||||
// Signal waiting threads to retry locking
|
||||
stripe->stripe_cv.notify_all(); |
||||
} |
||||
} |
||||
} |
||||
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
@ -0,0 +1,90 @@ |
||||
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once |
||||
#ifndef ROCKSDB_LITE |
||||
|
||||
#include <chrono> |
||||
#include <string> |
||||
#include <unordered_map> |
||||
#include <vector> |
||||
|
||||
#include "rocksdb/utilities/transaction.h" |
||||
#include "util/instrumented_mutex.h" |
||||
#include "util/thread_local.h" |
||||
#include "utilities/transactions/transaction_impl.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
class ColumnFamilyHandle; |
||||
struct LockInfo; |
||||
struct LockMap; |
||||
struct LockMapStripe; |
||||
|
||||
class Slice; |
||||
|
||||
class TransactionLockMgr { |
||||
public: |
||||
TransactionLockMgr(size_t default_num_stripes, int64_t max_num_locks); |
||||
|
||||
~TransactionLockMgr(); |
||||
|
||||
// Creates a new LockMap for this column family. Caller should guarantee
|
||||
// that this column family does not already exist.
|
||||
void AddColumnFamily(uint32_t column_family_id); |
||||
|
||||
// Deletes the LockMap for this column family. Caller should guarantee that
|
||||
// this column family is no longer in use.
|
||||
void RemoveColumnFamily(uint32_t column_family_id); |
||||
|
||||
// Attempt to lock key. If OK status is returned, the caller is responsible
|
||||
// for calling UnLock() on this key.
|
||||
Status TryLock(const TransactionImpl* txn, uint32_t column_family_id, |
||||
const std::string& key, Env* env); |
||||
|
||||
// Unlock a key locked by TryLock(). txn must be the same Transaction that
|
||||
// locked this key.
|
||||
void UnLock(const TransactionImpl* txn, const TransactionKeyMap* keys, |
||||
Env* env); |
||||
void UnLock(TransactionImpl* txn, uint32_t column_family_id, |
||||
const std::string& key, Env* env); |
||||
|
||||
private: |
||||
// Default number of lock map stripes per column family
|
||||
const size_t default_num_stripes_; |
||||
|
||||
// Limit on number of keys locked per column family
|
||||
const int64_t max_num_locks_; |
||||
|
||||
// Must be held when accessing/modifying lock_maps_
|
||||
InstrumentedMutex lock_map_mutex_; |
||||
|
||||
// Map of ColumnFamilyId to locked key info
|
||||
using LockMaps = std::unordered_map<uint32_t, std::shared_ptr<LockMap>>; |
||||
LockMaps lock_maps_; |
||||
|
||||
// Thread-local cache of entries in lock_maps_. This is an optimization
|
||||
// to avoid acquiring a mutex in order to look up a LockMap
|
||||
std::unique_ptr<ThreadLocalPtr> lock_maps_cache_; |
||||
|
||||
bool IsLockExpired(const LockInfo& lock_info, Env* env, uint64_t* wait_time); |
||||
|
||||
std::shared_ptr<LockMap> GetLockMap(uint32_t column_family_id); |
||||
|
||||
Status AcquireWithTimeout(LockMap* lock_map, LockMapStripe* stripe, |
||||
const std::string& key, Env* env, int64_t timeout, |
||||
const LockInfo& lock_info); |
||||
|
||||
Status AcquireLocked(LockMap* lock_map, LockMapStripe* stripe, |
||||
const std::string& key, Env* env, |
||||
const LockInfo& lock_info, uint64_t* wait_time); |
||||
|
||||
// No copying allowed
|
||||
TransactionLockMgr(const TransactionLockMgr&); |
||||
void operator=(const TransactionLockMgr&); |
||||
}; |
||||
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,265 @@ |
||||
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#ifndef ROCKSDB_LITE |
||||
|
||||
#ifndef __STDC_FORMAT_MACROS |
||||
#define __STDC_FORMAT_MACROS |
||||
#endif |
||||
|
||||
#include "utilities/transactions/transaction_util.h" |
||||
|
||||
#include <inttypes.h> |
||||
#include <string> |
||||
#include <vector> |
||||
|
||||
#include "db/db_impl.h" |
||||
#include "rocksdb/status.h" |
||||
#include "rocksdb/utilities/write_batch_with_index.h" |
||||
#include "util/string_util.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
// Checks a single key of one column family for writes newer than key_seq.
//
// Takes a reference on the column family's current SuperVersion, runs
// CheckKey() against it, and releases the reference before returning.
// Returns Busy if the SuperVersion could not be obtained; otherwise returns
// whatever CheckKey() reports.
Status TransactionUtil::CheckKeyForConflicts(DBImpl* db_impl,
                                             ColumnFamilyHandle* column_family,
                                             const std::string& key,
                                             SequenceNumber key_seq) {
  auto* handle_impl = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
  auto* cf_data = handle_impl->cfd();

  SuperVersion* super_version = db_impl->GetAndRefSuperVersion(cf_data);
  if (super_version == nullptr) {
    // Nothing was referenced, so there is nothing to release.
    return Status::Busy("Could not access column family " +
                        handle_impl->GetName());
  }

  const SequenceNumber oldest_memtable_seq =
      db_impl->GetEarliestMemTableSequenceNumber(super_version, true);

  Status outcome =
      CheckKey(db_impl, super_version, oldest_memtable_seq, key_seq, key);

  // Always hand the SuperVersion back, whether or not a conflict was found.
  db_impl->ReturnAndCleanupSuperVersion(cf_data, super_version);

  return outcome;
}
||||
|
||||
// Decides, using only the memtables referenced by sv, whether key has been
// written after sequence number key_seq.
//
//   db_impl      - database used to query the memtables.
//   sv           - SuperVersion whose memtables are inspected.
//   earliest_seq - earliest sequence number covered by those memtables, or
//                  kMaxSequenceNumber if it is unknown.
//   key_seq      - sequence number at which the caller last observed key.
//   key          - user key to check.
//
// Returns OK if no newer write was found. Returns Busy if a newer write was
// found, or if the memtables do not reach back far enough to decide. Any
// error from the memtable lookup is returned unchanged.
Status TransactionUtil::CheckKey(DBImpl* db_impl, SuperVersion* sv,
                                 SequenceNumber earliest_seq,
                                 SequenceNumber key_seq,
                                 const std::string& key) {
  Status result;

  // Since it would be too slow to check the SST files, we will only use
  // the memtables to check whether there have been any recent writes
  // to this key after it was accessed in this transaction. But if the
  // Memtables do not contain a long enough history, we must fail the
  // transaction.
  if (earliest_seq == kMaxSequenceNumber) {
    // The age of this memtable is unknown. Cannot rely on it to check
    // for recent writes. This error shouldn't happen often in practice as
    // the Memtable should have a valid earliest sequence number except in
    // some corner cases (such as error cases during recovery).
    result = Status::Busy(
        "Transaction could not check for conflicts as the MemTable does not "
        "contain a long enough history to check write at SequenceNumber: ",
        ToString(key_seq));

  } else if (key_seq < earliest_seq) {
    // The age of this memtable is too new to use to check for recent
    // writes.
    //
    // The fixed text below is about 253 characters and each sequence number
    // can add up to 20 digits, so the buffer must hold close to 300 bytes
    // for the message not to be truncated by snprintf.
    char msg[300];
    snprintf(msg, sizeof(msg),
             "Transaction could not check for conflicts for operation at "
             "SequenceNumber %" PRIu64
             " as the MemTable only contains changes newer than SequenceNumber "
             "%" PRIu64
             ". Increasing the value of the "
             "max_write_buffer_number_to_maintain option could reduce the "
             "frequency "
             "of this error.",
             key_seq, earliest_seq);
    result = Status::Busy(msg);
  } else {
    // seq keeps its kMaxSequenceNumber sentinel unless the lookup overwrites
    // it (presumably only when the key is present in a memtable).
    SequenceNumber seq = kMaxSequenceNumber;
    Status s = db_impl->GetLatestSequenceForKeyFromMemtable(sv, key, &seq);
    if (!s.ok()) {
      result = s;
    } else if (seq != kMaxSequenceNumber && seq > key_seq) {
      // A write newer than the caller's view of the key exists: conflict.
      result = Status::Busy();
    }
  }

  return result;
}
||||
|
||||
// Runs CheckKey() for every (key, sequence number) pair in key_map, one
// column family at a time, stopping at the first non-OK result.
//
// A SuperVersion reference is taken per column family and released before
// moving on to the next one (or before returning a failure for that column
// family). Returns Busy if a column family's SuperVersion cannot be obtained.
Status TransactionUtil::CheckKeysForConflicts(DBImpl* db_impl,
                                              TransactionKeyMap* key_map) {
  for (auto& cf_entry : *key_map) {
    const uint32_t cf_id = cf_entry.first;
    const auto& tracked_keys = cf_entry.second;

    SuperVersion* sv = db_impl->GetAndRefSuperVersion(cf_id);
    if (sv == nullptr) {
      return Status::Busy("Could not access column family " + ToString(cf_id));
    }

    const SequenceNumber earliest_seq =
        db_impl->GetEarliestMemTableSequenceNumber(sv, true);

    // For each of the keys tracked for this column family, check whether
    // someone has written to it since the recorded sequence number.
    Status cf_status;
    for (const auto& tracked : tracked_keys) {
      const std::string& key = tracked.first;
      const SequenceNumber key_seq = tracked.second;

      cf_status = CheckKey(db_impl, sv, earliest_seq, key_seq, key);
      if (!cf_status.ok()) {
        break;
      }
    }

    // Release the SuperVersion before deciding whether to bail out.
    db_impl->ReturnAndCleanupSuperVersion(cf_id, sv);

    if (!cf_status.ok()) {
      return cf_status;
    }
  }

  return Status::OK();
}
||||
|
||||
// Replays batch through a WriteBatch::Handler that re-applies the first num
// entries (Put, Merge, Delete and log data) onto new_batch.
//
// The handler signals "done" by returning Aborted from the first
// Put/Merge/Delete seen after the limit; that status is translated back to
// OK here. Any other non-OK status is returned to the caller.
Status TransactionUtil::CopyFirstN(size_t num, WriteBatchWithIndex* batch,
                                   WriteBatchWithIndex* new_batch,
                                   DBImpl* db_impl) {
  // Copies entries into a destination batch until a fixed count is reached.
  class FirstNCopier : public WriteBatch::Handler {
   public:
    FirstNCopier(WriteBatchWithIndex* dest, size_t max_entries, DBImpl* db)
        : dest_(dest), max_entries_(max_entries), db_(db) {}

    ~FirstNCopier() override {
      // Drop every SuperVersion reference taken while resolving handles.
      for (auto& held : held_super_versions_) {
        db_->ReturnAndCleanupSuperVersionUnlocked(held.first, held.second);
      }
    }

    // Resolves column_family_id to a ColumnFamilyHandle, caching the result.
    // Looking the handle up requires grabbing a mutex, so it is done only
    // once per column family. To keep the handle valid, a reference on the
    // column family's SuperVersion is held until this object is destroyed.
    Status GetColumnFamily(uint32_t column_family_id,
                           ColumnFamilyHandle** cfh) {
      auto cached = cached_handles_.find(column_family_id);
      if (cached != cached_handles_.end()) {
        *cfh = cached->second;
        return Status::OK();
      }

      // Not cached yet: pin the SuperVersion first, then fetch the handle.
      SuperVersion* sv = db_->GetAndRefSuperVersionUnlocked(column_family_id);
      if (sv == nullptr) {
        return Status::InvalidArgument("Could not find column family for ID " +
                                       ToString(column_family_id));
      }
      held_super_versions_.insert({column_family_id, sv});

      *cfh = db_->GetColumnFamilyHandleUnlocked(column_family_id);
      if (*cfh == nullptr) {
        return Status::InvalidArgument(
            "Could not find column family handle for ID " +
            ToString(column_family_id));
      }
      cached_handles_.insert({column_family_id, *cfh});

      return Status::OK();
    }

    virtual Status PutCF(uint32_t column_family_id, const Slice& key,
                         const Slice& value) override {
      if (copied_ >= max_entries_) {
        // The first N entries are done; Aborted stops the iteration.
        return Status::Aborted();
      }
      ColumnFamilyHandle* handle = nullptr;
      Status lookup = GetColumnFamily(column_family_id, &handle);
      ++copied_;
      if (lookup.ok()) {
        dest_->Put(handle, key, value);
      }
      return lookup;
    }

    virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
                           const Slice& value) override {
      if (copied_ >= max_entries_) {
        // The first N entries are done; Aborted stops the iteration.
        return Status::Aborted();
      }
      ColumnFamilyHandle* handle = nullptr;
      Status lookup = GetColumnFamily(column_family_id, &handle);
      ++copied_;
      if (lookup.ok()) {
        dest_->Merge(handle, key, value);
      }
      return lookup;
    }

    virtual Status DeleteCF(uint32_t column_family_id,
                            const Slice& key) override {
      if (copied_ >= max_entries_) {
        // The first N entries are done; Aborted stops the iteration.
        return Status::Aborted();
      }
      ColumnFamilyHandle* handle = nullptr;
      Status lookup = GetColumnFamily(column_family_id, &handle);
      ++copied_;
      if (lookup.ok()) {
        dest_->Delete(handle, key);
      }
      return lookup;
    }

    // Log data counts towards the limit but cannot stop the iteration,
    // because this callback has no status to return.
    virtual void LogData(const Slice& blob) override {
      const bool within_limit = copied_ < max_entries_;
      ++copied_;
      if (within_limit) {
        dest_->PutLogData(blob);
      }
    }

   private:
    WriteBatchWithIndex* dest_;
    const size_t max_entries_;
    DBImpl* db_;
    // Number of entries visited so far (including log data).
    size_t copied_ = 0;
    std::unordered_map<uint32_t, SuperVersion*> held_super_versions_;
    std::unordered_map<uint32_t, ColumnFamilyHandle*> cached_handles_;
  };

  // Iterating with this handler adds the entries of batch to new_batch, up
  // to the limit.
  FirstNCopier copier(new_batch, num, db_impl);
  Status iterate_status = batch->GetWriteBatch()->Iterate(&copier);

  // Aborted only means the copier reached its limit; report that as success.
  return iterate_status.IsAborted() ? Status::OK() : iterate_status;
}
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // ROCKSDB_LITE
|
@ -0,0 +1,65 @@ |
||||
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once |
||||
|
||||
#ifndef ROCKSDB_LITE |
||||
|
||||
#include <string> |
||||
#include <unordered_map> |
||||
|
||||
#include "rocksdb/db.h" |
||||
#include "rocksdb/slice.h" |
||||
#include "rocksdb/status.h" |
||||
#include "rocksdb/types.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
using TransactionKeyMap = |
||||
std::unordered_map<uint32_t, |
||||
std::unordered_map<std::string, SequenceNumber>>; |
||||
|
||||
class DBImpl; |
||||
struct SuperVersion; |
||||
class WriteBatchWithIndex; |
||||
|
||||
class TransactionUtil { |
||||
public: |
||||
// Verifies there have been no writes to this key in the db since this
|
||||
// sequence number.
|
||||
//
|
||||
// Returns OK on success, BUSY if there is a conflicting write, or other error
|
||||
// status for any unexpected errors.
|
||||
static Status CheckKeyForConflicts(DBImpl* db_impl, |
||||
ColumnFamilyHandle* column_family, |
||||
const std::string& key, |
||||
SequenceNumber key_seq); |
||||
|
||||
// For each key,SequenceNumber pair in the TransactionKeyMap, this function
|
||||
// will verify there have been no writes to the key in the db since that
|
||||
// sequence number.
|
||||
//
|
||||
// Returns OK on success, BUSY if there is a conflicting write, or other error
|
||||
// status for any unexpected errors.
|
||||
//
|
||||
// REQUIRED: this function should only be called on the write thread or if the
|
||||
// mutex is held.
|
||||
static Status CheckKeysForConflicts(DBImpl* db_impl, TransactionKeyMap* keys); |
||||
|
||||
// Copies the first num entries from batch into new_batch (including Put,
|
||||
// Merge, Delete, and PutLogData).
|
||||
// Returns non-OK on error.
|
||||
static Status CopyFirstN(size_t num, WriteBatchWithIndex* batch, |
||||
WriteBatchWithIndex* new_batch, DBImpl* db_impl); |
||||
|
||||
private: |
||||
static Status CheckKey(DBImpl* db_impl, SuperVersion* sv, |
||||
SequenceNumber earliest_seq, SequenceNumber key_seq, |
||||
const std::string& key); |
||||
}; |
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // ROCKSDB_LITE
|
Loading…
Reference in new issue