|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#include <algorithm>
|
|
|
|
#include <cstdint>
|
|
|
|
#include <memory>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/options.h"
|
|
|
|
#include "rocksdb/utilities/optimistic_transaction_db.h"
|
|
|
|
#include "util/cast_util.h"
|
|
|
|
#include "util/mutexlock.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
|
|
|
|
class OccLockBucketsImplBase : public OccLockBuckets {
|
|
|
|
public:
|
|
|
|
virtual port::Mutex& GetLockBucket(const Slice& key, uint64_t seed) = 0;
|
|
|
|
};
|
|
|
|
|
|
|
|
template <bool cache_aligned>
|
|
|
|
class OccLockBucketsImpl : public OccLockBucketsImplBase {
|
|
|
|
public:
|
|
|
|
explicit OccLockBucketsImpl(size_t bucket_count) : locks_(bucket_count) {}
|
|
|
|
port::Mutex& GetLockBucket(const Slice& key, uint64_t seed) override {
|
|
|
|
return locks_.Get(key, seed);
|
|
|
|
}
|
|
|
|
size_t ApproximateMemoryUsage() const override {
|
|
|
|
return locks_.ApproximateMemoryUsage();
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
// TODO: investigate optionally using folly::MicroLock to majorly save space
|
|
|
|
using M = std::conditional_t<cache_aligned, CacheAlignedWrapper<port::Mutex>,
|
|
|
|
port::Mutex>;
|
|
|
|
Striped<M> locks_;
|
|
|
|
};
|
|
|
|
|
|
|
|
class OptimisticTransactionDBImpl : public OptimisticTransactionDB {
|
|
|
|
public:
|
|
|
|
explicit OptimisticTransactionDBImpl(
|
|
|
|
DB* db, const OptimisticTransactionDBOptions& occ_options,
|
|
|
|
bool take_ownership = true)
|
|
|
|
: OptimisticTransactionDB(db),
|
|
|
|
db_owner_(take_ownership),
|
|
|
|
validate_policy_(occ_options.validate_policy) {
|
|
|
|
if (validate_policy_ == OccValidationPolicy::kValidateParallel) {
|
|
|
|
auto bucketed_locks = occ_options.shared_lock_buckets;
|
|
|
|
if (!bucketed_locks) {
|
|
|
|
uint32_t bucket_count = std::max(16u, occ_options.occ_lock_buckets);
|
|
|
|
bucketed_locks = MakeSharedOccLockBuckets(bucket_count);
|
|
|
|
}
|
|
|
|
bucketed_locks_ = static_cast_with_check<OccLockBucketsImplBase>(
|
|
|
|
std::move(bucketed_locks));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
~OptimisticTransactionDBImpl() {
|
|
|
|
// Prevent this stackable from destroying
|
|
|
|
// base db
|
|
|
|
if (!db_owner_) {
|
|
|
|
db_ = nullptr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Transaction* BeginTransaction(const WriteOptions& write_options,
|
|
|
|
const OptimisticTransactionOptions& txn_options,
|
|
|
|
Transaction* old_txn) override;
|
|
|
|
|
|
|
|
// Transactional `DeleteRange()` is not yet supported.
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
using StackableDB::DeleteRange;
|
|
|
|
virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*,
|
|
|
|
const Slice&, const Slice&) override {
|
|
|
|
return Status::NotSupported();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Range deletions also must not be snuck into `WriteBatch`es as they are
|
|
|
|
// incompatible with `OptimisticTransactionDB`.
|
|
|
|
virtual Status Write(const WriteOptions& write_opts,
|
|
|
|
WriteBatch* batch) override {
|
|
|
|
if (batch->HasDeleteRange()) {
|
|
|
|
return Status::NotSupported();
|
|
|
|
}
|
|
|
|
return OptimisticTransactionDB::Write(write_opts, batch);
|
|
|
|
}
|
|
|
|
|
|
|
|
OccValidationPolicy GetValidatePolicy() const { return validate_policy_; }
|
|
|
|
|
|
|
|
port::Mutex& GetLockBucket(const Slice& key, uint64_t seed) {
|
|
|
|
return bucketed_locks_->GetLockBucket(key, seed);
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
std::shared_ptr<OccLockBucketsImplBase> bucketed_locks_;
|
|
|
|
|
|
|
|
bool db_owner_;
|
|
|
|
|
|
|
|
const OccValidationPolicy validate_policy_;
|
|
|
|
|
|
|
|
void ReinitializeTransaction(Transaction* txn,
|
|
|
|
const WriteOptions& write_options,
|
|
|
|
const OptimisticTransactionOptions& txn_options =
|
|
|
|
OptimisticTransactionOptions());
|
|
|
|
};
|
|
|
|
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|