New API to get all merge operands for a Key (#5604)
Summary: This is a new API added to db.h that fetches all merge operands associated with a key. The main motivation for this API is to support performance-sensitive use cases where doing a full online merge is not necessary. Example use cases: 1. Update a subset of columns and read a subset of columns - Imagine a SQL table where a row is encoded as a K/V pair (as is done in MyRocks). If there are many columns and users only update one of them, we can use a merge operator to reduce write amplification. When users read only one or two columns in the read query, this feature can avoid a full merge of the whole row and save some CPU. 2. Updating very few attributes in a value which is a JSON-like document - Updating one attribute can be done efficiently using a merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge. ---------------------------------------------------------------------------------------------------- API : Status GetMergeOperands( const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* merge_operands, GetMergeOperandsOptions* get_merge_operands_options, int* number_of_operands) Example usage : int size = 100; int number_of_operands = 0; std::vector<PinnableSlice> values(size); GetMergeOperandsOptions merge_operands_info; db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), &merge_operands_info, &number_of_operands); Description : Returns all the merge operands corresponding to the key. If the number of merge operands in the DB is greater than get_merge_operands_options.expected_max_number_of_operands, no merge operands are returned and the status is Incomplete. Merge operands are returned in the order of insertion. merge_operands -> Points to an array of at least get_merge_operands_options.expected_max_number_of_operands elements; the caller is responsible for allocating it. 
If the status returned is Incomplete, then number_of_operands will contain the total number of merge operands found in the DB for the key. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604 Test Plan: Added a unit test and a perf test in db_bench that can be run using the command: ./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist Differential Revision: D16657366 Pulled By: vjnadimpalli fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf main
parent
4f98b43ba3
commit
d150e01474
@ -0,0 +1,240 @@ |
|||||||
|
// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
|
||||||
|
// This source code is licensed under both the GPLv2 (found in the
|
||||||
|
// COPYING file in the root directory) and Apache 2.0 License
|
||||||
|
// (found in the LICENSE.Apache file in the root directory).
|
||||||
|
|
||||||
|
#include "db/db_test_util.h" |
||||||
|
#include "port/stack_trace.h" |
||||||
|
#include "rocksdb/perf_context.h" |
||||||
|
#include "rocksdb/utilities/debug.h" |
||||||
|
#include "table/block_based/block_builder.h" |
||||||
|
#include "test_util/fault_injection_test_env.h" |
||||||
|
#if !defined(ROCKSDB_LITE) |
||||||
|
#include "test_util/sync_point.h" |
||||||
|
#endif |
||||||
|
#include "rocksdb/merge_operator.h" |
||||||
|
#include "utilities/merge_operators.h" |
||||||
|
#include "utilities/merge_operators/sortlist.h" |
||||||
|
#include "utilities/merge_operators/string_append/stringappend2.h" |
||||||
|
|
||||||
|
namespace rocksdb { |
||||||
|
|
||||||
|
class DBMergeOperandTest : public DBTestBase { |
||||||
|
public: |
||||||
|
DBMergeOperandTest() : DBTestBase("/db_merge_operand_test") {} |
||||||
|
}; |
||||||
|
|
||||||
|
// Exercises DB::GetMergeOperands() for keys whose values/operands live in the
// memtable, in a single SST file, in several SST files, in several levels and
// in an immutable memtable. Every status returned by the DB is asserted so
// that a failing call is reported where it happens instead of surfacing later
// as a confusing value mismatch.
TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) {
  // String-append operator whose ShouldMerge() returns true once at least
  // `limit` operands are presented (and `limit` is positive).
  class LimitedStringAppendMergeOp : public StringAppendTESTOperator {
   public:
    LimitedStringAppendMergeOp(int limit, char delim)
        : StringAppendTESTOperator(delim), limit_(limit) {}

    const char* Name() const override {
      return "DBMergeOperatorTest::LimitedStringAppendMergeOp";
    }

    bool ShouldMerge(const std::vector<Slice>& operands) const override {
      if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) {
        return true;
      }
      return false;
    }

   private:
    size_t limit_ = 0;
  };

  Options options;
  options.create_if_missing = true;
  // Use only the latest two merge operands.
  options.merge_operator = std::make_shared<LimitedStringAppendMergeOp>(2, ',');
  options.env = env_;
  Reopen(options);
  const int num_records = 4;
  int number_of_operands = 0;
  std::vector<PinnableSlice> values(num_records);
  GetMergeOperandsOptions merge_operands_info;
  merge_operands_info.expected_max_number_of_operands = num_records;

  // Fetches the operands of `key` from the default column family into
  // `values` and hands the status back to the caller for checking.
  auto get_merge_operands = [&](const Slice& key) {
    return db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
                                 key, values.data(), &merge_operands_info,
                                 &number_of_operands);
  };

  // k0 value in memtable
  ASSERT_OK(Put("k0", "PutARock"));
  ASSERT_OK(get_merge_operands("k0"));
  ASSERT_EQ(values[0], "PutARock");

  // k0.1 value in SST
  ASSERT_OK(Put("k0.1", "RockInSST"));
  ASSERT_OK(Flush());
  ASSERT_OK(get_merge_operands("k0.1"));
  ASSERT_EQ(values[0], "RockInSST");

  // All k1 values are in memtable.
  ASSERT_OK(Merge("k1", "a"));
  ASSERT_OK(Put("k1", "x"));
  ASSERT_OK(Merge("k1", "b"));
  ASSERT_OK(Merge("k1", "c"));
  ASSERT_OK(Merge("k1", "d"));
  ASSERT_OK(get_merge_operands("k1"));
  ASSERT_EQ(values[0], "x");
  ASSERT_EQ(values[1], "b");
  ASSERT_EQ(values[2], "c");
  ASSERT_EQ(values[3], "d");

  // expected_max_number_of_operands is less than number of merge operands so
  // status should be Incomplete.
  merge_operands_info.expected_max_number_of_operands = num_records - 1;
  Status status = get_merge_operands("k1");
  ASSERT_TRUE(status.IsIncomplete());
  merge_operands_info.expected_max_number_of_operands = num_records;

  // All k1.1 values are in memtable.
  ASSERT_OK(Merge("k1.1", "r"));
  ASSERT_OK(Delete("k1.1"));
  ASSERT_OK(Merge("k1.1", "c"));
  ASSERT_OK(Merge("k1.1", "k"));
  ASSERT_OK(Merge("k1.1", "s"));
  ASSERT_OK(get_merge_operands("k1.1"));
  ASSERT_EQ(values[0], "c");
  ASSERT_EQ(values[1], "k");
  ASSERT_EQ(values[2], "s");

  // All k2 values are flushed to L0 into a single file.
  ASSERT_OK(Merge("k2", "q"));
  ASSERT_OK(Merge("k2", "w"));
  ASSERT_OK(Merge("k2", "e"));
  ASSERT_OK(Merge("k2", "r"));
  ASSERT_OK(Flush());
  ASSERT_OK(get_merge_operands("k2"));
  ASSERT_EQ(values[0], "q");
  ASSERT_EQ(values[1], "w");
  ASSERT_EQ(values[2], "e");
  ASSERT_EQ(values[3], "r");

  // All k2.1 values are flushed to L0 into a single file.
  ASSERT_OK(Merge("k2.1", "m"));
  ASSERT_OK(Put("k2.1", "l"));
  ASSERT_OK(Merge("k2.1", "n"));
  ASSERT_OK(Merge("k2.1", "o"));
  ASSERT_OK(Flush());
  ASSERT_OK(get_merge_operands("k2.1"));
  ASSERT_EQ(values[0], "l,n,o");

  // All k2.2 values are flushed to L0 into a single file.
  ASSERT_OK(Merge("k2.2", "g"));
  ASSERT_OK(Delete("k2.2"));
  ASSERT_OK(Merge("k2.2", "o"));
  ASSERT_OK(Merge("k2.2", "t"));
  ASSERT_OK(Flush());
  ASSERT_OK(get_merge_operands("k2.2"));
  ASSERT_EQ(values[0], "o,t");

  // Do some compaction that will make the following tests more predictable
  // Slice start("PutARock");
  // Slice end("t");
  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));

  // All k3 values are flushed and are in different files.
  ASSERT_OK(Merge("k3", "ab"));
  ASSERT_OK(Flush());
  ASSERT_OK(Merge("k3", "bc"));
  ASSERT_OK(Flush());
  ASSERT_OK(Merge("k3", "cd"));
  ASSERT_OK(Flush());
  ASSERT_OK(Merge("k3", "de"));
  ASSERT_OK(get_merge_operands("k3"));
  ASSERT_EQ(values[0], "ab");
  ASSERT_EQ(values[1], "bc");
  ASSERT_EQ(values[2], "cd");
  ASSERT_EQ(values[3], "de");

  // All k3.1 values are flushed and are in different files.
  ASSERT_OK(Merge("k3.1", "ab"));
  ASSERT_OK(Flush());
  ASSERT_OK(Put("k3.1", "bc"));
  ASSERT_OK(Flush());
  ASSERT_OK(Merge("k3.1", "cd"));
  ASSERT_OK(Flush());
  ASSERT_OK(Merge("k3.1", "de"));
  ASSERT_OK(get_merge_operands("k3.1"));
  ASSERT_EQ(values[0], "bc");
  ASSERT_EQ(values[1], "cd");
  ASSERT_EQ(values[2], "de");

  // All k3.2 values are flushed and are in different files.
  ASSERT_OK(Merge("k3.2", "ab"));
  ASSERT_OK(Flush());
  ASSERT_OK(Delete("k3.2"));
  ASSERT_OK(Flush());
  ASSERT_OK(Merge("k3.2", "cd"));
  ASSERT_OK(Flush());
  ASSERT_OK(Merge("k3.2", "de"));
  ASSERT_OK(get_merge_operands("k3.2"));
  ASSERT_EQ(values[0], "cd");
  ASSERT_EQ(values[1], "de");

  // All k4 values are in different levels
  ASSERT_OK(Merge("k4", "ba"));
  ASSERT_OK(Flush());
  MoveFilesToLevel(4);
  ASSERT_OK(Merge("k4", "cb"));
  ASSERT_OK(Flush());
  MoveFilesToLevel(3);
  ASSERT_OK(Merge("k4", "dc"));
  ASSERT_OK(Flush());
  MoveFilesToLevel(1);
  ASSERT_OK(Merge("k4", "ed"));
  ASSERT_OK(get_merge_operands("k4"));
  ASSERT_EQ(values[0], "ba");
  ASSERT_EQ(values[1], "cb");
  ASSERT_EQ(values[2], "dc");
  ASSERT_EQ(values[3], "ed");

  // First 3 k5 values are in SST and next 4 k5 values are in Immutable Memtable
  ASSERT_OK(Merge("k5", "who"));
  ASSERT_OK(Merge("k5", "am"));
  ASSERT_OK(Merge("k5", "i"));
  ASSERT_OK(Flush());
  ASSERT_OK(Put("k5", "remember"));
  ASSERT_OK(Merge("k5", "i"));
  ASSERT_OK(Merge("k5", "am"));
  ASSERT_OK(Merge("k5", "rocks"));
  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
  ASSERT_OK(get_merge_operands("k5"));
  ASSERT_EQ(values[0], "remember");
  ASSERT_EQ(values[1], "i");
  ASSERT_EQ(values[2], "am");
}
||||||
|
|
||||||
|
} // namespace rocksdb
|
||||||
|
|
||||||
|
int main(int argc, char** argv) { |
||||||
|
rocksdb::port::InstallStackTraceHandler(); |
||||||
|
::testing::InitGoogleTest(&argc, argv); |
||||||
|
return RUN_ALL_TESTS(); |
||||||
|
} |
@ -0,0 +1,100 @@ |
|||||||
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||||
|
// This source code is licensed under both the GPLv2 (found in the
|
||||||
|
// COPYING file in the root directory) and Apache 2.0 License
|
||||||
|
// (found in the LICENSE.Apache file in the root directory).
|
||||||
|
#include "rocksdb/merge_operator.h" |
||||||
|
#include "rocksdb/slice.h" |
||||||
|
#include "utilities/merge_operators.h" |
||||||
|
#include "utilities/merge_operators/sortlist.h" |
||||||
|
|
||||||
|
using rocksdb::Logger; |
||||||
|
using rocksdb::MergeOperator; |
||||||
|
using rocksdb::Slice; |
||||||
|
|
||||||
|
namespace rocksdb { |
||||||
|
|
||||||
|
bool SortList::FullMergeV2(const MergeOperationInput& merge_in, |
||||||
|
MergeOperationOutput* merge_out) const { |
||||||
|
std::vector<int> left; |
||||||
|
for (Slice slice : merge_in.operand_list) { |
||||||
|
std::vector<int> right; |
||||||
|
MakeVector(right, slice); |
||||||
|
left = Merge(left, right); |
||||||
|
} |
||||||
|
for (int i = 0; i < static_cast<int>(left.size()) - 1; i++) { |
||||||
|
merge_out->new_value.append(std::to_string(left[i])).append(","); |
||||||
|
} |
||||||
|
merge_out->new_value.append(std::to_string(left.back())); |
||||||
|
return true; |
||||||
|
} |
||||||
|
|
||||||
|
bool SortList::PartialMerge(const Slice& /*key*/, const Slice& left_operand, |
||||||
|
const Slice& right_operand, std::string* new_value, |
||||||
|
Logger* /*logger*/) const { |
||||||
|
std::vector<int> left; |
||||||
|
std::vector<int> right; |
||||||
|
MakeVector(left, left_operand); |
||||||
|
MakeVector(right, right_operand); |
||||||
|
left = Merge(left, right); |
||||||
|
for (int i = 0; i < static_cast<int>(left.size()) - 1; i++) { |
||||||
|
new_value->append(std::to_string(left[i])).append(","); |
||||||
|
} |
||||||
|
new_value->append(std::to_string(left.back())); |
||||||
|
return true; |
||||||
|
} |
||||||
|
|
||||||
|
bool SortList::PartialMergeMulti(const Slice& /*key*/, |
||||||
|
const std::deque<Slice>& operand_list, |
||||||
|
std::string* new_value, |
||||||
|
Logger* /*logger*/) const { |
||||||
|
(void)operand_list; |
||||||
|
(void)new_value; |
||||||
|
return true; |
||||||
|
} |
||||||
|
|
||||||
|
// Returns the identifier string of this merge operator.
const char* SortList::Name() const {
  const char* const operator_name = "MergeSortOperator";
  return operator_name;
}
||||||
|
|
||||||
|
void SortList::MakeVector(std::vector<int>& operand, Slice slice) const { |
||||||
|
do { |
||||||
|
const char* begin = slice.data_; |
||||||
|
while (*slice.data_ != ',' && *slice.data_) slice.data_++; |
||||||
|
operand.push_back(std::stoi(std::string(begin, slice.data_))); |
||||||
|
} while (0 != *slice.data_++); |
||||||
|
} |
||||||
|
|
||||||
|
std::vector<int> SortList::Merge(std::vector<int>& left, |
||||||
|
std::vector<int>& right) const { |
||||||
|
// Fill the resultant vector with sorted results from both vectors
|
||||||
|
std::vector<int> result; |
||||||
|
unsigned left_it = 0, right_it = 0; |
||||||
|
|
||||||
|
while (left_it < left.size() && right_it < right.size()) { |
||||||
|
// If the left value is smaller than the right it goes next
|
||||||
|
// into the resultant vector
|
||||||
|
if (left[left_it] < right[right_it]) { |
||||||
|
result.push_back(left[left_it]); |
||||||
|
left_it++; |
||||||
|
} else { |
||||||
|
result.push_back(right[right_it]); |
||||||
|
right_it++; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// Push the remaining data from both vectors onto the resultant
|
||||||
|
while (left_it < left.size()) { |
||||||
|
result.push_back(left[left_it]); |
||||||
|
left_it++; |
||||||
|
} |
||||||
|
|
||||||
|
while (right_it < right.size()) { |
||||||
|
result.push_back(right[right_it]); |
||||||
|
right_it++; |
||||||
|
} |
||||||
|
|
||||||
|
return result; |
||||||
|
} |
||||||
|
|
||||||
|
std::shared_ptr<MergeOperator> MergeOperators::CreateSortOperator() { |
||||||
|
return std::make_shared<SortList>(); |
||||||
|
} |
||||||
|
} // namespace rocksdb
|
@ -0,0 +1,38 @@ |
|||||||
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||||
|
// This source code is licensed under both the GPLv2 (found in the
|
||||||
|
// COPYING file in the root directory) and Apache 2.0 License
|
||||||
|
// (found in the LICENSE.Apache file in the root directory).
|
||||||
|
|
||||||
|
// A MergeOperator for RocksDB that implements Merge Sort.
|
||||||
|
// It is built using the MergeOperator interface. The operator works by taking
|
||||||
|
// an input which contains one or more merge operands where each operand is a
|
||||||
|
// list of sorted ints and merges them to form a large sorted list.
|
||||||
|
#pragma once |
||||||
|
|
||||||
|
#include "rocksdb/merge_operator.h" |
||||||
|
#include "rocksdb/slice.h" |
||||||
|
|
||||||
|
namespace rocksdb { |
||||||
|
|
||||||
|
class SortList : public MergeOperator { |
||||||
|
public: |
||||||
|
bool FullMergeV2(const MergeOperationInput& merge_in, |
||||||
|
MergeOperationOutput* merge_out) const override; |
||||||
|
|
||||||
|
bool PartialMerge(const Slice& /*key*/, const Slice& left_operand, |
||||||
|
const Slice& right_operand, std::string* new_value, |
||||||
|
Logger* /*logger*/) const override; |
||||||
|
|
||||||
|
bool PartialMergeMulti(const Slice& key, |
||||||
|
const std::deque<Slice>& operand_list, |
||||||
|
std::string* new_value, Logger* logger) const override; |
||||||
|
|
||||||
|
const char* Name() const override; |
||||||
|
|
||||||
|
void MakeVector(std::vector<int>& operand, Slice slice) const; |
||||||
|
|
||||||
|
private: |
||||||
|
std::vector<int> Merge(std::vector<int>& left, std::vector<int>& right) const; |
||||||
|
}; |
||||||
|
|
||||||
|
} // namespace rocksdb
|
Loading…
Reference in new issue