|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
|
|
|
|
#include "table/get_context.h"
|
|
|
|
|
|
|
|
#include "db/blob//blob_fetcher.h"
|
|
|
|
#include "db/merge_helper.h"
|
Introduce FullMergeV2 (eliminate memcpy from merge operators)
Summary:
This diff update the code to pin the merge operator operands while the merge operation is done, so that we can eliminate the memcpy cost, to do that we need a new public API for FullMerge that replace the std::deque<std::string> with std::vector<Slice>
This diff is stacked on top of D56493 and D56511
In this diff we
- Update FullMergeV2 arguments to be encapsulated in MergeOperationInput and MergeOperationOutput which will make it easier to add new arguments in the future
- Replace std::deque<std::string> with std::vector<Slice> to pass operands
- Replace MergeContext std::deque with std::vector (based on a simple benchmark I ran https://gist.github.com/IslamAbdelRahman/78fc86c9ab9f52b1df791e58943fb187)
- Allow FullMergeV2 output to be an existing operand
```
[Everything in Memtable | 10K operands | 10 KB each | 1 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=10000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000
[FullMergeV2]
readseq : 0.607 micros/op 1648235 ops/sec; 16121.2 MB/s
readseq : 0.478 micros/op 2091546 ops/sec; 20457.2 MB/s
readseq : 0.252 micros/op 3972081 ops/sec; 38850.5 MB/s
readseq : 0.237 micros/op 4218328 ops/sec; 41259.0 MB/s
readseq : 0.247 micros/op 4043927 ops/sec; 39553.2 MB/s
[master]
readseq : 3.935 micros/op 254140 ops/sec; 2485.7 MB/s
readseq : 3.722 micros/op 268657 ops/sec; 2627.7 MB/s
readseq : 3.149 micros/op 317605 ops/sec; 3106.5 MB/s
readseq : 3.125 micros/op 320024 ops/sec; 3130.1 MB/s
readseq : 4.075 micros/op 245374 ops/sec; 2400.0 MB/s
```
```
[Everything in Memtable | 10K operands | 10 KB each | 10 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=1000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000
[FullMergeV2]
readseq : 3.472 micros/op 288018 ops/sec; 2817.1 MB/s
readseq : 2.304 micros/op 434027 ops/sec; 4245.2 MB/s
readseq : 1.163 micros/op 859845 ops/sec; 8410.0 MB/s
readseq : 1.192 micros/op 838926 ops/sec; 8205.4 MB/s
readseq : 1.250 micros/op 800000 ops/sec; 7824.7 MB/s
[master]
readseq : 24.025 micros/op 41623 ops/sec; 407.1 MB/s
readseq : 18.489 micros/op 54086 ops/sec; 529.0 MB/s
readseq : 18.693 micros/op 53495 ops/sec; 523.2 MB/s
readseq : 23.621 micros/op 42335 ops/sec; 414.1 MB/s
readseq : 18.775 micros/op 53262 ops/sec; 521.0 MB/s
```
```
[Everything in Block cache | 10K operands | 10 KB each | 1 operand per key]
[FullMergeV2]
$ DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions
readseq : 14.741 micros/op 67837 ops/sec; 663.5 MB/s
readseq : 1.029 micros/op 971446 ops/sec; 9501.6 MB/s
readseq : 0.974 micros/op 1026229 ops/sec; 10037.4 MB/s
readseq : 0.965 micros/op 1036080 ops/sec; 10133.8 MB/s
readseq : 0.943 micros/op 1060657 ops/sec; 10374.2 MB/s
[master]
readseq : 16.735 micros/op 59755 ops/sec; 584.5 MB/s
readseq : 3.029 micros/op 330151 ops/sec; 3229.2 MB/s
readseq : 3.136 micros/op 318883 ops/sec; 3119.0 MB/s
readseq : 3.065 micros/op 326245 ops/sec; 3191.0 MB/s
readseq : 3.014 micros/op 331813 ops/sec; 3245.4 MB/s
```
```
[Everything in Block cache | 10K operands | 10 KB each | 10 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10-operands-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions
[FullMergeV2]
readseq : 24.325 micros/op 41109 ops/sec; 402.1 MB/s
readseq : 1.470 micros/op 680272 ops/sec; 6653.7 MB/s
readseq : 1.231 micros/op 812347 ops/sec; 7945.5 MB/s
readseq : 1.091 micros/op 916590 ops/sec; 8965.1 MB/s
readseq : 1.109 micros/op 901713 ops/sec; 8819.6 MB/s
[master]
readseq : 27.257 micros/op 36687 ops/sec; 358.8 MB/s
readseq : 4.443 micros/op 225073 ops/sec; 2201.4 MB/s
readseq : 5.830 micros/op 171526 ops/sec; 1677.7 MB/s
readseq : 4.173 micros/op 239635 ops/sec; 2343.8 MB/s
readseq : 4.150 micros/op 240963 ops/sec; 2356.8 MB/s
```
Test Plan: COMPILE_WITH_ASAN=1 make check -j64
Reviewers: yhchiang, andrewkr, sdong
Reviewed By: sdong
Subscribers: lovro, andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D57075
9 years ago
|
|
|
#include "db/pinned_iterators_manager.h"
|
|
|
|
#include "db/read_callback.h"
|
|
|
|
#include "monitoring/file_read_sample.h"
|
|
|
|
#include "monitoring/perf_context_imp.h"
|
|
|
|
#include "monitoring/statistics.h"
|
|
|
|
#include "rocksdb/merge_operator.h"
|
|
|
|
#include "rocksdb/statistics.h"
|
|
|
|
#include "rocksdb/system_clock.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
void appendToReplayLog(std::string* replay_log, ValueType type, Slice value) {
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
if (replay_log) {
|
|
|
|
if (replay_log->empty()) {
|
|
|
|
// Optimization: in the common case of only one operation in the
|
|
|
|
// log, we allocate the exact amount of space needed.
|
|
|
|
replay_log->reserve(1 + VarintLength(value.size()) + value.size());
|
|
|
|
}
|
|
|
|
replay_log->push_back(type);
|
|
|
|
PutLengthPrefixedSlice(replay_log, value);
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
(void)replay_log;
|
|
|
|
(void)type;
|
|
|
|
(void)value;
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
GetContext::GetContext(const Comparator* ucmp,
|
|
|
|
const MergeOperator* merge_operator, Logger* logger,
|
|
|
|
Statistics* statistics, GetState init_state,
|
|
|
|
const Slice& user_key, PinnableSlice* pinnable_val,
|
|
|
|
std::string* timestamp, bool* value_found,
|
|
|
|
MergeContext* merge_context, bool do_merge,
|
|
|
|
SequenceNumber* _max_covering_tombstone_seq,
|
|
|
|
SystemClock* clock, SequenceNumber* seq,
|
|
|
|
PinnedIteratorsManager* _pinned_iters_mgr,
|
|
|
|
ReadCallback* callback, bool* is_blob_index,
|
|
|
|
uint64_t tracing_get_id, BlobFetcher* blob_fetcher)
|
|
|
|
: ucmp_(ucmp),
|
|
|
|
merge_operator_(merge_operator),
|
|
|
|
logger_(logger),
|
|
|
|
statistics_(statistics),
|
|
|
|
state_(init_state),
|
|
|
|
user_key_(user_key),
|
|
|
|
pinnable_val_(pinnable_val),
|
|
|
|
timestamp_(timestamp),
|
|
|
|
value_found_(value_found),
|
|
|
|
merge_context_(merge_context),
|
Use only "local" range tombstones during Get (#4449)
Summary:
Previously, range tombstones were accumulated from every level, which
was necessary if a range tombstone in a higher level covered a key in a lower
level. However, RangeDelAggregator::AddTombstones's complexity is based on
the number of tombstones that are currently stored in it, which is wasteful in
the Get case, where we only need to know the highest sequence number of range
tombstones that cover the key from higher levels, and compute the highest covering
sequence number at the current level. This change introduces this optimization, and
removes the use of RangeDelAggregator from the Get path.
In the benchmark results, the following command was used to initialize the database:
```
./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8
```
...and the following command was used to measure read throughput:
```
./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32
```
The filluniquerandom command was only run once, and the resulting database was used
to measure read performance before and after the PR. Both binaries were compiled with
`DEBUG_LEVEL=0`.
Readrandom results before PR:
```
readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found)
```
Readrandom results after PR:
```
readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found)
```
So it's actually slower right now, but this PR paves the way for future optimizations (see #4493).
----
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449
Differential Revision: D10370575
Pulled By: abhimadan
fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
6 years ago
|
|
|
max_covering_tombstone_seq_(_max_covering_tombstone_seq),
|
|
|
|
clock_(clock),
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
9 years ago
|
|
|
seq_(seq),
|
Introduce FullMergeV2 (eliminate memcpy from merge operators)
Summary:
This diff update the code to pin the merge operator operands while the merge operation is done, so that we can eliminate the memcpy cost, to do that we need a new public API for FullMerge that replace the std::deque<std::string> with std::vector<Slice>
This diff is stacked on top of D56493 and D56511
In this diff we
- Update FullMergeV2 arguments to be encapsulated in MergeOperationInput and MergeOperationOutput which will make it easier to add new arguments in the future
- Replace std::deque<std::string> with std::vector<Slice> to pass operands
- Replace MergeContext std::deque with std::vector (based on a simple benchmark I ran https://gist.github.com/IslamAbdelRahman/78fc86c9ab9f52b1df791e58943fb187)
- Allow FullMergeV2 output to be an existing operand
```
[Everything in Memtable | 10K operands | 10 KB each | 1 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=10000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000
[FullMergeV2]
readseq : 0.607 micros/op 1648235 ops/sec; 16121.2 MB/s
readseq : 0.478 micros/op 2091546 ops/sec; 20457.2 MB/s
readseq : 0.252 micros/op 3972081 ops/sec; 38850.5 MB/s
readseq : 0.237 micros/op 4218328 ops/sec; 41259.0 MB/s
readseq : 0.247 micros/op 4043927 ops/sec; 39553.2 MB/s
[master]
readseq : 3.935 micros/op 254140 ops/sec; 2485.7 MB/s
readseq : 3.722 micros/op 268657 ops/sec; 2627.7 MB/s
readseq : 3.149 micros/op 317605 ops/sec; 3106.5 MB/s
readseq : 3.125 micros/op 320024 ops/sec; 3130.1 MB/s
readseq : 4.075 micros/op 245374 ops/sec; 2400.0 MB/s
```
```
[Everything in Memtable | 10K operands | 10 KB each | 10 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=1000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000
[FullMergeV2]
readseq : 3.472 micros/op 288018 ops/sec; 2817.1 MB/s
readseq : 2.304 micros/op 434027 ops/sec; 4245.2 MB/s
readseq : 1.163 micros/op 859845 ops/sec; 8410.0 MB/s
readseq : 1.192 micros/op 838926 ops/sec; 8205.4 MB/s
readseq : 1.250 micros/op 800000 ops/sec; 7824.7 MB/s
[master]
readseq : 24.025 micros/op 41623 ops/sec; 407.1 MB/s
readseq : 18.489 micros/op 54086 ops/sec; 529.0 MB/s
readseq : 18.693 micros/op 53495 ops/sec; 523.2 MB/s
readseq : 23.621 micros/op 42335 ops/sec; 414.1 MB/s
readseq : 18.775 micros/op 53262 ops/sec; 521.0 MB/s
```
```
[Everything in Block cache | 10K operands | 10 KB each | 1 operand per key]
[FullMergeV2]
$ DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions
readseq : 14.741 micros/op 67837 ops/sec; 663.5 MB/s
readseq : 1.029 micros/op 971446 ops/sec; 9501.6 MB/s
readseq : 0.974 micros/op 1026229 ops/sec; 10037.4 MB/s
readseq : 0.965 micros/op 1036080 ops/sec; 10133.8 MB/s
readseq : 0.943 micros/op 1060657 ops/sec; 10374.2 MB/s
[master]
readseq : 16.735 micros/op 59755 ops/sec; 584.5 MB/s
readseq : 3.029 micros/op 330151 ops/sec; 3229.2 MB/s
readseq : 3.136 micros/op 318883 ops/sec; 3119.0 MB/s
readseq : 3.065 micros/op 326245 ops/sec; 3191.0 MB/s
readseq : 3.014 micros/op 331813 ops/sec; 3245.4 MB/s
```
```
[Everything in Block cache | 10K operands | 10 KB each | 10 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10-operands-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions
[FullMergeV2]
readseq : 24.325 micros/op 41109 ops/sec; 402.1 MB/s
readseq : 1.470 micros/op 680272 ops/sec; 6653.7 MB/s
readseq : 1.231 micros/op 812347 ops/sec; 7945.5 MB/s
readseq : 1.091 micros/op 916590 ops/sec; 8965.1 MB/s
readseq : 1.109 micros/op 901713 ops/sec; 8819.6 MB/s
[master]
readseq : 27.257 micros/op 36687 ops/sec; 358.8 MB/s
readseq : 4.443 micros/op 225073 ops/sec; 2201.4 MB/s
readseq : 5.830 micros/op 171526 ops/sec; 1677.7 MB/s
readseq : 4.173 micros/op 239635 ops/sec; 2343.8 MB/s
readseq : 4.150 micros/op 240963 ops/sec; 2356.8 MB/s
```
Test Plan: COMPILE_WITH_ASAN=1 make check -j64
Reviewers: yhchiang, andrewkr, sdong
Reviewed By: sdong
Subscribers: lovro, andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D57075
9 years ago
|
|
|
replay_log_(nullptr),
|
|
|
|
pinned_iters_mgr_(_pinned_iters_mgr),
|
|
|
|
callback_(callback),
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
6 years ago
|
|
|
do_merge_(do_merge),
|
|
|
|
is_blob_index_(is_blob_index),
|
|
|
|
tracing_get_id_(tracing_get_id),
|
|
|
|
blob_fetcher_(blob_fetcher) {
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
9 years ago
|
|
|
if (seq_) {
|
|
|
|
*seq_ = kMaxSequenceNumber;
|
|
|
|
}
|
|
|
|
sample_ = should_sample_file_read();
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
9 years ago
|
|
|
}
|
|
|
|
|
|
|
|
GetContext::GetContext(
|
|
|
|
const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger,
|
|
|
|
Statistics* statistics, GetState init_state, const Slice& user_key,
|
|
|
|
PinnableSlice* pinnable_val, bool* value_found, MergeContext* merge_context,
|
|
|
|
bool do_merge, SequenceNumber* _max_covering_tombstone_seq,
|
|
|
|
SystemClock* clock, SequenceNumber* seq,
|
|
|
|
PinnedIteratorsManager* _pinned_iters_mgr, ReadCallback* callback,
|
|
|
|
bool* is_blob_index, uint64_t tracing_get_id, BlobFetcher* blob_fetcher)
|
|
|
|
: GetContext(ucmp, merge_operator, logger, statistics, init_state, user_key,
|
|
|
|
pinnable_val, nullptr, value_found, merge_context, do_merge,
|
|
|
|
_max_covering_tombstone_seq, clock, seq, _pinned_iters_mgr,
|
|
|
|
callback, is_blob_index, tracing_get_id, blob_fetcher) {}
|
|
|
|
|
|
|
|
// Called from TableCache::Get and Table::Get when file/block in which
|
|
|
|
// key may exist are not there in TableCache/BlockCache respectively. In this
|
|
|
|
// case we can't guarantee that key does not exist and are not permitted to do
|
|
|
|
// IO to be certain.Set the status=kFound and value_found=false to let the
|
|
|
|
// caller know that key may exist but is not there in memory
|
|
|
|
void GetContext::MarkKeyMayExist() {
|
|
|
|
state_ = kFound;
|
|
|
|
if (value_found_ != nullptr) {
|
|
|
|
*value_found_ = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void GetContext::SaveValue(const Slice& value, SequenceNumber /*seq*/) {
|
|
|
|
assert(state_ == kNotFound);
|
|
|
|
appendToReplayLog(replay_log_, kTypeValue, value);
|
|
|
|
|
|
|
|
state_ = kFound;
|
|
|
|
if (LIKELY(pinnable_val_ != nullptr)) {
|
|
|
|
pinnable_val_->PinSelf(value);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void GetContext::ReportCounters() {
|
|
|
|
if (get_context_stats_.num_cache_hit > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_HIT, get_context_stats_.num_cache_hit);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_index_hit > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_INDEX_HIT,
|
|
|
|
get_context_stats_.num_cache_index_hit);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_data_hit > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_DATA_HIT,
|
|
|
|
get_context_stats_.num_cache_data_hit);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_filter_hit > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_FILTER_HIT,
|
|
|
|
get_context_stats_.num_cache_filter_hit);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_compression_dict_hit > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_HIT,
|
|
|
|
get_context_stats_.num_cache_compression_dict_hit);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_index_miss > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_INDEX_MISS,
|
|
|
|
get_context_stats_.num_cache_index_miss);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_filter_miss > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_FILTER_MISS,
|
|
|
|
get_context_stats_.num_cache_filter_miss);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_data_miss > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_DATA_MISS,
|
|
|
|
get_context_stats_.num_cache_data_miss);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_compression_dict_miss > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_MISS,
|
|
|
|
get_context_stats_.num_cache_compression_dict_miss);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_bytes_read > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_BYTES_READ,
|
|
|
|
get_context_stats_.num_cache_bytes_read);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_miss > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_MISS,
|
|
|
|
get_context_stats_.num_cache_miss);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_add > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_ADD, get_context_stats_.num_cache_add);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_add_redundant > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_ADD_REDUNDANT,
|
|
|
|
get_context_stats_.num_cache_add_redundant);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_bytes_write > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_BYTES_WRITE,
|
|
|
|
get_context_stats_.num_cache_bytes_write);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_index_add > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_INDEX_ADD,
|
|
|
|
get_context_stats_.num_cache_index_add);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_index_add_redundant > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_INDEX_ADD_REDUNDANT,
|
|
|
|
get_context_stats_.num_cache_index_add_redundant);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_index_bytes_insert > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_INDEX_BYTES_INSERT,
|
|
|
|
get_context_stats_.num_cache_index_bytes_insert);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_data_add > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_DATA_ADD,
|
|
|
|
get_context_stats_.num_cache_data_add);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_data_add_redundant > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_DATA_ADD_REDUNDANT,
|
|
|
|
get_context_stats_.num_cache_data_add_redundant);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_data_bytes_insert > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_DATA_BYTES_INSERT,
|
|
|
|
get_context_stats_.num_cache_data_bytes_insert);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_filter_add > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_FILTER_ADD,
|
|
|
|
get_context_stats_.num_cache_filter_add);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_filter_add_redundant > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_FILTER_ADD_REDUNDANT,
|
|
|
|
get_context_stats_.num_cache_filter_add_redundant);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_filter_bytes_insert > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_FILTER_BYTES_INSERT,
|
|
|
|
get_context_stats_.num_cache_filter_bytes_insert);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_compression_dict_add > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_ADD,
|
|
|
|
get_context_stats_.num_cache_compression_dict_add);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_compression_dict_add_redundant > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT,
|
|
|
|
get_context_stats_.num_cache_compression_dict_add_redundant);
|
|
|
|
}
|
|
|
|
if (get_context_stats_.num_cache_compression_dict_bytes_insert > 0) {
|
|
|
|
RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
|
|
|
|
get_context_stats_.num_cache_compression_dict_bytes_insert);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
|
|
|
|
const Slice& value, bool* matched,
|
|
|
|
Cleanable* value_pinner) {
|
|
|
|
assert(matched);
|
|
|
|
assert((state_ != kMerge && parsed_key.type != kTypeMerge) ||
|
|
|
|
merge_context_ != nullptr);
|
|
|
|
if (ucmp_->EqualWithoutTimestamp(parsed_key.user_key, user_key_)) {
|
|
|
|
*matched = true;
|
|
|
|
// If the value is not in the snapshot, skip it
|
|
|
|
if (!CheckCallback(parsed_key.sequence)) {
|
|
|
|
return true; // to continue to the next seq
|
|
|
|
}
|
|
|
|
|
|
|
|
appendToReplayLog(replay_log_, parsed_key.type, value);
|
|
|
|
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
9 years ago
|
|
|
if (seq_ != nullptr) {
|
|
|
|
// Set the sequence number if it is uninitialized
|
|
|
|
if (*seq_ == kMaxSequenceNumber) {
|
|
|
|
*seq_ = parsed_key.sequence;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
auto type = parsed_key.type;
|
|
|
|
// Key matches. Process it
|
|
|
|
if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex) &&
|
Use only "local" range tombstones during Get (#4449)
Summary:
Previously, range tombstones were accumulated from every level, which
was necessary if a range tombstone in a higher level covered a key in a lower
level. However, RangeDelAggregator::AddTombstones's complexity is based on
the number of tombstones that are currently stored in it, which is wasteful in
the Get case, where we only need to know the highest sequence number of range
tombstones that cover the key from higher levels, and compute the highest covering
sequence number at the current level. This change introduces this optimization, and
removes the use of RangeDelAggregator from the Get path.
In the benchmark results, the following command was used to initialize the database:
```
./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8
```
...and the following command was used to measure read throughput:
```
./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32
```
The filluniquerandom command was only run once, and the resulting database was used
to measure read performance before and after the PR. Both binaries were compiled with
`DEBUG_LEVEL=0`.
Readrandom results before PR:
```
readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found)
```
Readrandom results after PR:
```
readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found)
```
So it's actually slower right now, but this PR paves the way for future optimizations (see #4493).
----
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449
Differential Revision: D10370575
Pulled By: abhimadan
fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
6 years ago
|
|
|
max_covering_tombstone_seq_ != nullptr &&
|
|
|
|
*max_covering_tombstone_seq_ > parsed_key.sequence) {
|
|
|
|
type = kTypeRangeDeletion;
|
|
|
|
}
|
|
|
|
switch (type) {
|
|
|
|
case kTypeValue:
|
|
|
|
case kTypeBlobIndex:
|
|
|
|
assert(state_ == kNotFound || state_ == kMerge);
|
|
|
|
if (type == kTypeBlobIndex && is_blob_index_ == nullptr) {
|
|
|
|
// Blob value not supported. Stop.
|
|
|
|
state_ = kUnexpectedBlobIndex;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (is_blob_index_ != nullptr) {
|
|
|
|
*is_blob_index_ = (type == kTypeBlobIndex);
|
|
|
|
}
|
|
|
|
if (kNotFound == state_) {
|
|
|
|
state_ = kFound;
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
6 years ago
|
|
|
if (do_merge_) {
|
|
|
|
if (LIKELY(pinnable_val_ != nullptr)) {
|
|
|
|
if (LIKELY(value_pinner != nullptr)) {
|
|
|
|
// If the backing resources for the value are provided, pin them
|
|
|
|
pinnable_val_->PinSlice(value, value_pinner);
|
|
|
|
} else {
|
|
|
|
TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf",
|
|
|
|
this);
|
|
|
|
// Otherwise copy the value
|
|
|
|
pinnable_val_->PinSelf(value);
|
|
|
|
}
|
|
|
|
}
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
6 years ago
|
|
|
} else {
|
|
|
|
// It means this function is called as part of DB GetMergeOperands
|
|
|
|
// API and the current value should be part of
|
|
|
|
// merge_context_->operand_list
|
|
|
|
if (is_blob_index_ != nullptr && *is_blob_index_) {
|
|
|
|
PinnableSlice pin_val;
|
|
|
|
if (GetBlobValue(value, &pin_val) == false) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
Slice blob_value(pin_val);
|
|
|
|
push_operand(blob_value, nullptr);
|
|
|
|
} else {
|
|
|
|
push_operand(value, value_pinner);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (kMerge == state_) {
|
|
|
|
assert(merge_operator_ != nullptr);
|
|
|
|
if (is_blob_index_ != nullptr && *is_blob_index_) {
|
|
|
|
PinnableSlice pin_val;
|
|
|
|
if (GetBlobValue(value, &pin_val) == false) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
Slice blob_value(pin_val);
|
|
|
|
state_ = kFound;
|
|
|
|
if (do_merge_) {
|
|
|
|
Merge(&blob_value);
|
|
|
|
} else {
|
|
|
|
// It means this function is called as part of DB GetMergeOperands
|
|
|
|
// API and the current value should be part of
|
|
|
|
// merge_context_->operand_list
|
|
|
|
push_operand(blob_value, nullptr);
|
|
|
|
}
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
6 years ago
|
|
|
} else {
|
|
|
|
state_ = kFound;
|
|
|
|
if (do_merge_) {
|
|
|
|
Merge(&value);
|
|
|
|
} else {
|
|
|
|
// It means this function is called as part of DB GetMergeOperands
|
|
|
|
// API and the current value should be part of
|
|
|
|
// merge_context_->operand_list
|
|
|
|
push_operand(value, value_pinner);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (state_ == kFound) {
|
|
|
|
size_t ts_sz = ucmp_->timestamp_size();
|
|
|
|
if (ts_sz > 0 && timestamp_ != nullptr) {
|
|
|
|
Slice ts = ExtractTimestampFromUserKey(parsed_key.user_key, ts_sz);
|
|
|
|
timestamp_->assign(ts.data(), ts.size());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
|
|
|
|
case kTypeDeletion:
|
|
|
|
case kTypeDeletionWithTimestamp:
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
9 years ago
|
|
|
case kTypeSingleDeletion:
|
|
|
|
case kTypeRangeDeletion:
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
9 years ago
|
|
|
// TODO(noetzli): Verify correctness once merge of single-deletes
|
|
|
|
// is supported
|
|
|
|
assert(state_ == kNotFound || state_ == kMerge);
|
|
|
|
if (kNotFound == state_) {
|
|
|
|
state_ = kDeleted;
|
|
|
|
size_t ts_sz = ucmp_->timestamp_size();
|
|
|
|
if (ts_sz > 0 && timestamp_ != nullptr) {
|
|
|
|
Slice ts = ExtractTimestampFromUserKey(parsed_key.user_key, ts_sz);
|
|
|
|
timestamp_->assign(ts.data(), ts.size());
|
|
|
|
}
|
|
|
|
} else if (kMerge == state_) {
|
|
|
|
state_ = kFound;
|
|
|
|
Merge(nullptr);
|
|
|
|
// If do_merge_ = false then the current value shouldn't be part of
|
|
|
|
// merge_context_->operand_list
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
|
|
|
|
case kTypeMerge:
|
|
|
|
assert(state_ == kNotFound || state_ == kMerge);
|
|
|
|
state_ = kMerge;
|
|
|
|
// value_pinner is not set from plain_table_reader.cc for example.
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
6 years ago
|
|
|
push_operand(value, value_pinner);
|
|
|
|
if (do_merge_ && merge_operator_ != nullptr &&
|
|
|
|
merge_operator_->ShouldMerge(
|
|
|
|
merge_context_->GetOperandsDirectionBackward())) {
|
|
|
|
state_ = kFound;
|
|
|
|
Merge(nullptr);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
|
|
|
|
default:
|
|
|
|
assert(false);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// state_ could be Corrupt, merge or notfound
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
void GetContext::Merge(const Slice* value) {
|
|
|
|
if (LIKELY(pinnable_val_ != nullptr)) {
|
|
|
|
if (do_merge_) {
|
|
|
|
Status merge_status = MergeHelper::TimedFullMerge(
|
|
|
|
merge_operator_, user_key_, value, merge_context_->GetOperands(),
|
|
|
|
pinnable_val_->GetSelf(), logger_, statistics_, clock_);
|
|
|
|
pinnable_val_->PinSelf();
|
|
|
|
if (!merge_status.ok()) {
|
|
|
|
state_ = kCorrupt;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bool GetContext::GetBlobValue(const Slice& blob_index,
|
|
|
|
PinnableSlice* blob_value) {
|
|
|
|
constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
|
|
|
|
constexpr uint64_t* bytes_read = nullptr;
|
|
|
|
|
|
|
|
Status status = blob_fetcher_->FetchBlob(
|
|
|
|
user_key_, blob_index, prefetch_buffer, blob_value, bytes_read);
|
|
|
|
if (!status.ok()) {
|
|
|
|
if (status.IsIncomplete()) {
|
|
|
|
MarkKeyMayExist();
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
state_ = kCorrupt;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
*is_blob_index_ = false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
6 years ago
|
|
|
void GetContext::push_operand(const Slice& value, Cleanable* value_pinner) {
|
|
|
|
if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() &&
|
|
|
|
value_pinner != nullptr) {
|
|
|
|
value_pinner->DelegateCleanupsTo(pinned_iters_mgr());
|
|
|
|
merge_context_->PushOperand(value, true /*value_pinned*/);
|
|
|
|
} else {
|
|
|
|
merge_context_->PushOperand(value, false);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void replayGetContextLog(const Slice& replay_log, const Slice& user_key,
|
|
|
|
GetContext* get_context, Cleanable* value_pinner) {
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
Slice s = replay_log;
|
|
|
|
while (s.size()) {
|
|
|
|
auto type = static_cast<ValueType>(*s.data());
|
|
|
|
s.remove_prefix(1);
|
|
|
|
Slice value;
|
|
|
|
bool ret = GetLengthPrefixedSlice(&s, &value);
|
|
|
|
assert(ret);
|
|
|
|
(void)ret;
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
9 years ago
|
|
|
|
|
|
|
bool dont_care __attribute__((__unused__));
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
9 years ago
|
|
|
// Since SequenceNumber is not stored and unknown, we will use
|
|
|
|
// kMaxSequenceNumber.
|
|
|
|
get_context->SaveValue(
|
|
|
|
ParsedInternalKey(user_key, kMaxSequenceNumber, type), value,
|
|
|
|
&dont_care, value_pinner);
|
|
|
|
}
|
|
|
|
#else // ROCKSDB_LITE
|
|
|
|
(void)replay_log;
|
|
|
|
(void)user_key;
|
|
|
|
(void)get_context;
|
|
|
|
(void)value_pinner;
|
|
|
|
assert(false);
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|