Summary: This diff introduces a new Merge operation into rocksdb. The purpose of this review is mostly to get feedback from the team (everyone, please) on the design. Please focus on the four files under include/leveldb/, as they spell out the client-visible interface change: include/leveldb/db.h include/leveldb/merge_operator.h include/leveldb/options.h include/leveldb/write_batch.h Please go over local/my_test.cc carefully, as it is a concrete use case. Please also review the implementation files to see if the straw-man implementation makes sense. Note that the diff does pass all of make check and truly supports a forward iterator over the db and a version of Get that is based on the iterator. Future work: - Integration with compaction - A raw Get implementation I am working on a wiki that explains the design and implementation choices, but coding comes just naturally and I think it might be a good idea to share the code earlier. The code is heavily commented. Test Plan: run all local tests Reviewers: dhruba, heyongqiang Reviewed By: dhruba CC: leveldb, zshao, sheki, emayanke, MarkCallaghan Differential Revision: https://reviews.facebook.net/D9651
main
parent
37e97b1297
commit
05e8854085
@ -0,0 +1,114 @@ |
||||
#include "merge_helper.h" |
||||
#include "db/dbformat.h" |
||||
#include "leveldb/comparator.h" |
||||
#include "leveldb/db.h" |
||||
#include "leveldb/merge_operator.h" |
||||
#include <string> |
||||
#include <stdio.h> |
||||
|
||||
namespace leveldb { |
||||
|
||||
// PRE: iter points to the first merge type entry
|
||||
// POST: iter points to the first entry beyond the merge process (or the end)
|
||||
// key_, value_ are updated to reflect the merge result
|
||||
void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before, |
||||
bool at_bottom) { |
||||
// get a copy of the internal key, before it's invalidated by iter->Next()
|
||||
key_.assign(iter->key().data(), iter->key().size()); |
||||
// we need to parse the internal key again as the parsed key is
|
||||
// backed by the internal key!
|
||||
ParsedInternalKey orig_ikey; |
||||
// Assume no internal key corruption as it has been successfully parsed
|
||||
// by the caller.
|
||||
// TODO: determine a good alternative of assert (exception?)
|
||||
ParseInternalKey(key_, &orig_ikey); |
||||
std::string operand(iter->value().data(), iter->value().size()); |
||||
|
||||
bool hit_the_next_user_key = false; |
||||
ParsedInternalKey ikey; |
||||
for (iter->Next(); iter->Valid(); iter->Next()) { |
||||
if (!ParseInternalKey(iter->key(), &ikey)) { |
||||
// stop at corrupted key
|
||||
if (assert_valid_internal_key_) { |
||||
assert(!"corrupted internal key is not expected"); |
||||
} |
||||
break; |
||||
} |
||||
|
||||
if (user_comparator_->Compare(ikey.user_key, orig_ikey.user_key) != 0) { |
||||
// hit a different user key, stop right here
|
||||
hit_the_next_user_key = true; |
||||
break; |
||||
} |
||||
|
||||
if (stop_before && ikey.sequence <= stop_before) { |
||||
// hit an entry that's visible by the previous snapshot, can't touch that
|
||||
break; |
||||
} |
||||
|
||||
if (kTypeDeletion == ikey.type) { |
||||
// hit a delete
|
||||
// => merge nullptr with operand
|
||||
// => change the entry type to kTypeValue
|
||||
// We are done!
|
||||
user_merge_operator_->Merge(ikey.user_key, nullptr, operand, |
||||
&value_, logger_); |
||||
orig_ikey.type = kTypeValue; |
||||
UpdateInternalKey(&key_[0], key_.size(), |
||||
orig_ikey.sequence, orig_ikey.type); |
||||
// move iter to the next entry
|
||||
iter->Next(); |
||||
return; |
||||
} |
||||
|
||||
if (kTypeValue == ikey.type) { |
||||
// hit a put
|
||||
// => merge the put value with operand
|
||||
// => change the entry type to kTypeValue
|
||||
// We are done!
|
||||
const Slice value = iter->value(); |
||||
user_merge_operator_->Merge(ikey.user_key, &value, Slice(operand), |
||||
&value_, logger_); |
||||
orig_ikey.type = kTypeValue; |
||||
UpdateInternalKey(&key_[0], key_.size(), |
||||
orig_ikey.sequence, orig_ikey.type); |
||||
// move iter to the next entry
|
||||
iter->Next(); |
||||
return; |
||||
} |
||||
|
||||
if (kTypeMerge == ikey.type) { |
||||
// hit a merge
|
||||
// => merge the value with operand.
|
||||
// => put the result back to operand and continue
|
||||
const Slice value = iter->value(); |
||||
user_merge_operator_->Merge(ikey.user_key, &value, operand, |
||||
&value_, logger_); |
||||
swap(value_, operand); |
||||
continue; |
||||
} |
||||
} |
||||
|
||||
// We have seen the root history of this key if we are at the
|
||||
// bottem level and exhausted all internal keys of this user key
|
||||
// NOTE: !iter->Valid() does not necessarily mean we hit the
|
||||
// beginning of a user key, as versions of a user key might be
|
||||
// split into multiple files and some files might not be included
|
||||
// in the merge.
|
||||
bool seen_the_beginning = hit_the_next_user_key && at_bottom; |
||||
|
||||
if (seen_the_beginning) { |
||||
// do a final merge with nullptr as the existing value and say
|
||||
// bye to the merge type (it's now converted to a Put)
|
||||
assert(kTypeMerge == orig_ikey.type); |
||||
user_merge_operator_->Merge(orig_ikey.user_key, nullptr, operand, |
||||
&value_, logger_); |
||||
orig_ikey.type = kTypeValue; |
||||
UpdateInternalKey(&key_[0], key_.size(), |
||||
orig_ikey.sequence, orig_ikey.type); |
||||
} else { |
||||
swap(value_, operand); |
||||
} |
||||
} |
||||
|
||||
} // namespace leveldb
|
@ -0,0 +1,64 @@ |
||||
#ifndef MERGE_HELPER_H |
||||
#define MERGE_HELPER_H |
||||
|
||||
#include "db/dbformat.h" |
||||
#include "leveldb/slice.h" |
||||
#include <string> |
||||
|
||||
namespace leveldb { |
||||
|
||||
class Comparator; |
||||
class Iterator; |
||||
class Logger; |
||||
class MergeOperator; |
||||
|
||||
class MergeHelper { |
||||
public: |
||||
MergeHelper(const Comparator* user_comparator, |
||||
const MergeOperator* user_merge_operator, |
||||
Logger* logger, |
||||
bool assert_valid_internal_key) |
||||
: user_comparator_(user_comparator), |
||||
user_merge_operator_(user_merge_operator), |
||||
logger_(logger), |
||||
assert_valid_internal_key_(assert_valid_internal_key) {} |
||||
|
||||
// Merge entries until we hit
|
||||
// - a corrupted key
|
||||
// - a Put/Delete,
|
||||
// - a different user key,
|
||||
// - a specific sequence number (snapshot boundary),
|
||||
// or - the end of iteration
|
||||
// iter: (IN) points to the first merge type entry
|
||||
// (OUT) points to the first entry not included in the merge process
|
||||
// stop_before: (IN) a sequence number that merge should not cross.
|
||||
// 0 means no restriction
|
||||
// at_bottom: (IN) true if the iterator covers the bottem level, which means
|
||||
// we could reach the start of the history of this user key.
|
||||
void MergeUntil(Iterator* iter, SequenceNumber stop_before = 0, |
||||
bool at_bottom = false); |
||||
|
||||
// Query the merge result
|
||||
// These are valid until the next MergeUtil call
|
||||
// IMPORTANT: the key type could change after the MergeUntil call.
|
||||
// Put/Delete + Merge + ... + Merge => Put
|
||||
// Merge + ... + Merge => Merge
|
||||
Slice key() { return Slice(key_); } |
||||
Slice value() { return Slice(value_); } |
||||
|
||||
private: |
||||
const Comparator* user_comparator_; |
||||
const MergeOperator* user_merge_operator_; |
||||
Logger* logger_; |
||||
Iterator* iter_; // in: the internal iterator, positioned at the first merge entry
|
||||
bool assert_valid_internal_key_; // enforce no internal key corruption?
|
||||
|
||||
// the scratch area that holds the result of MergeUntil
|
||||
// valid up to the next MergeUntil call
|
||||
std::string key_; |
||||
std::string value_; |
||||
}; |
||||
|
||||
} // namespace leveldb
|
||||
|
||||
#endif |
@ -0,0 +1,253 @@ |
||||
#include <assert.h> |
||||
#include <memory> |
||||
#include <iostream> |
||||
|
||||
#include "leveldb/cache.h" |
||||
#include "leveldb/comparator.h" |
||||
#include "leveldb/db.h" |
||||
#include "leveldb/env.h" |
||||
#include "leveldb/merge_operator.h" |
||||
#include "db/dbformat.h" |
||||
#include "utilities/merge_operators.h" |
||||
|
||||
using namespace std; |
||||
using namespace leveldb; |
||||
|
||||
auto mergeOperator = MergeOperators::CreateUInt64AddOperator(); |
||||
|
||||
std::shared_ptr<DB> OpenDb() { |
||||
DB* db; |
||||
Options options; |
||||
options.create_if_missing = true; |
||||
options.merge_operator = mergeOperator.get(); |
||||
Status s = DB::Open(options, "/tmp/testdb", &db); |
||||
if (!s.ok()) { |
||||
cerr << s.ToString() << endl; |
||||
assert(false); |
||||
} |
||||
return std::shared_ptr<DB>(db); |
||||
} |
||||
|
||||
// Imagine we are maintaining a set of uint64 counters.
|
||||
// Each counter has a distinct name. And we would like
|
||||
// to support four high level operations:
|
||||
// set, add, get and remove
|
||||
// This is a quick implementation without a Merge operation.
|
||||
class Counters { |
||||
|
||||
protected: |
||||
std::shared_ptr<DB> db_; |
||||
|
||||
WriteOptions put_option_; |
||||
ReadOptions get_option_; |
||||
WriteOptions delete_option_; |
||||
|
||||
uint64_t default_; |
||||
|
||||
public: |
||||
Counters(std::shared_ptr<DB> db, uint64_t defaultCount = 0) |
||||
: db_(db), |
||||
put_option_(), |
||||
get_option_(), |
||||
delete_option_(), |
||||
default_(defaultCount) { |
||||
assert(db_); |
||||
} |
||||
|
||||
virtual ~Counters() {} |
||||
|
||||
// public interface of Counters.
|
||||
// All four functions return false
|
||||
// if the underlying level db operation failed.
|
||||
|
||||
// mapped to a levedb Put
|
||||
bool set(const string& key, uint64_t value) { |
||||
// just treat the internal rep of int64 as the string
|
||||
Slice slice((char *)&value, sizeof(value)); |
||||
auto s = db_->Put(put_option_, key, slice); |
||||
|
||||
if (s.ok()) { |
||||
return true; |
||||
} else { |
||||
cerr << s.ToString() << endl; |
||||
return false; |
||||
} |
||||
} |
||||
|
||||
// mapped to a leveldb Delete
|
||||
bool remove(const string& key) { |
||||
auto s = db_->Delete(delete_option_, key); |
||||
|
||||
if (s.ok()) { |
||||
return true; |
||||
} else { |
||||
cerr << s.ToString() << std::endl; |
||||
return false; |
||||
} |
||||
} |
||||
|
||||
// mapped to a leveldb Get
|
||||
bool get(const string& key, uint64_t *value) { |
||||
string str; |
||||
auto s = db_->Get(get_option_, key, &str); |
||||
|
||||
if (s.IsNotFound()) { |
||||
// return default value if not found;
|
||||
*value = default_; |
||||
return true; |
||||
} else if (s.ok()) { |
||||
// deserialization
|
||||
if (str.size() != sizeof(uint64_t)) { |
||||
cerr << "value corruption\n"; |
||||
return false; |
||||
} |
||||
*value = DecodeFixed64(&str[0]); |
||||
return true; |
||||
} else { |
||||
cerr << s.ToString() << std::endl; |
||||
return false; |
||||
} |
||||
} |
||||
|
||||
// 'add' is implemented as get -> modify -> set
|
||||
// An alternative is a single merge operation, see MergeBasedCounters
|
||||
virtual bool add(const string& key, uint64_t value) { |
||||
uint64_t base = default_; |
||||
return get(key, &base) && set(key, base + value); |
||||
} |
||||
|
||||
|
||||
// convenience functions for testing
|
||||
void assert_set(const string& key, uint64_t value) { |
||||
assert(set(key, value)); |
||||
} |
||||
|
||||
void assert_remove(const string& key) { |
||||
assert(remove(key)); |
||||
} |
||||
|
||||
uint64_t assert_get(const string& key) { |
||||
uint64_t value = default_; |
||||
assert(get(key, &value)); |
||||
return value; |
||||
} |
||||
|
||||
void assert_add(const string& key, uint64_t value) { |
||||
assert(add(key, value)); |
||||
} |
||||
}; |
||||
|
||||
// Implement 'add' directly with the new Merge operation
|
||||
class MergeBasedCounters : public Counters { |
||||
private: |
||||
WriteOptions merge_option_; // for merge
|
||||
|
||||
public: |
||||
MergeBasedCounters(std::shared_ptr<DB> db, uint64_t defaultCount = 0) |
||||
: Counters(db, defaultCount), |
||||
merge_option_() { |
||||
} |
||||
|
||||
// mapped to a leveldb Merge operation
|
||||
virtual bool add(const string& key, uint64_t value) override { |
||||
char encoded[sizeof(uint64_t)]; |
||||
EncodeFixed64(encoded, value); |
||||
Slice slice(encoded, sizeof(uint64_t)); |
||||
auto s = db_->Merge(merge_option_, key, slice); |
||||
|
||||
if (s.ok()) { |
||||
return true; |
||||
} else { |
||||
cerr << s.ToString() << endl; |
||||
return false; |
||||
} |
||||
} |
||||
}; |
||||
|
||||
// Prints every entry of db to stdout as "<key>: <counter value>".
// Values are expected to be fixed-width 64-bit counters; an entry of any
// other size is reported instead of being decoded, so that a short value is
// never read past its end.
void dumpDb(DB* db) {
  auto it = unique_ptr<Iterator>(db->NewIterator(ReadOptions()));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    const Slice raw = it->value();
    if (raw.size() != sizeof(uint64_t)) {
      cout << it->key().ToString() << ": <unexpected value size "
           << raw.size() << ">" << endl;
      continue;
    }
    uint64_t value = DecodeFixed64(raw.data());
    cout << it->key().ToString() << ": " << value << endl;
  }
  assert(it->status().ok());  // Check for any errors found during the scan
}
||||
|
||||
// Runs the counter scenario against `counters`, printing progress markers and
// DB dumps to stdout. When test_compaction is true the memtable is flushed at
// several points and a full-range compaction is run at the end, after which
// the counter values are checked again.
void testCounters(Counters& counters, DB* db, bool test_compaction) {
  FlushOptions flush_opts;
  flush_opts.wait = true;

  // Flushes only when compaction testing was requested.
  const auto flush_if_requested = [&]() {
    if (test_compaction) db->Flush(flush_opts);
  };

  counters.assert_set("a", 1);
  flush_if_requested();
  assert(counters.assert_get("a") == 1);

  counters.assert_remove("b");

  // default value is 0 if non-existent
  assert(counters.assert_get("b") == 0);

  counters.assert_add("a", 2);
  flush_if_requested();

  // 1+2 = 3
  assert(counters.assert_get("a")== 3);

  dumpDb(db);

  std::cout << "1\n";

  // 1+...+49 = ?
  uint64_t sum = 0;
  for (int step = 1; step <= 49; ++step) {
    counters.assert_add("b", step);
    sum += step;
  }
  assert(counters.assert_get("b") == sum);

  std::cout << "2\n";
  dumpDb(db);

  std::cout << "3\n";

  if (!test_compaction) {
    return;
  }

  db->Flush(flush_opts);

  cout << "Compaction started ...\n";
  db->CompactRange(nullptr, nullptr);
  cout << "Compaction ended\n";

  dumpDb(db);

  // The values must survive the compaction unchanged.
  assert(counters.assert_get("a")== 3);
  assert(counters.assert_get("b") == sum);
}
||||
|
||||
int main(int argc, char *argv[]) { |
||||
|
||||
auto db = OpenDb(); |
||||
|
||||
{ |
||||
cout << "Test read-modify-write counters... \n"; |
||||
Counters counters(db, 0); |
||||
testCounters(counters, db.get(), true); |
||||
} |
||||
|
||||
bool compact = false; |
||||
if (argc > 1) { |
||||
compact = true; |
||||
cout << "Turn on Compaction\n"; |
||||
} |
||||
|
||||
{ |
||||
cout << "Test merge-based counters... \n"; |
||||
MergeBasedCounters counters(db, 0); |
||||
testCounters(counters, db.get(), compact); |
||||
} |
||||
|
||||
return 0; |
||||
} |
@ -0,0 +1,74 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_INCLUDE_MERGE_OPERATOR_H_ |
||||
#define STORAGE_LEVELDB_INCLUDE_MERGE_OPERATOR_H_ |
||||
|
||||
#include <string> |
||||
|
||||
namespace leveldb {

class Logger;
class Slice;

// The Merge Operator interface.
//
// A client must supply an object implementing this interface if it uses the
// Merge operation. MergeOperator defines the SEMANTICS of a merge, which only
// the client knows: numeric addition, list append, string concatenation, or
// anything else. The library is only concerned with invoking the interface at
// the right time (during get, iteration, compaction...).
//
// In principle no special property of the operator is required, but the
// current rocksdb compaction order implies that an associative operator can
// be exercised more naturally (and more efficiently).
//
// Refer to my_test.cc for an example of use.
class MergeOperator {
 public:
  virtual ~MergeOperator() = default;

  // Gives the client a way to express read -> modify -> write semantics.
  //
  // key:            (IN)  The key associated with this merge operation.
  //                       A client may dispatch on it if the key space is
  //                       partitioned and different subspaces hold different
  //                       kinds of data with different merge semantics.
  // existing_value: (IN)  null indicates that the key does not exist before
  //                       this op.
  // value:          (IN)  The merge operand passed in when Merge was issued.
  // new_value:      (OUT) The client is responsible for storing the merge
  //                       result here.
  // logger:         (IN)  The client may use this to log errors during merge.
  //
  // Note: Merge does not return anything to indicate whether it succeeded.
  // Rationale: if a merge fails due to, say, a de-serialization error, a
  //   consistent merge result must still be defined. Should the existing
  //   value be thrown away? The merge operand? Or should the merged value be
  //   reset to something? The rocksdb library is not in a position to make
  //   the right choice. The client, on the other hand, knows exactly what
  //   happened during Merge and can make the best decision; it simply saves
  //   that decision in new_value. logger is passed in, in case the client
  //   wants to leave a trace of what went wrong.
  virtual void Merge(const Slice& key,
                     const Slice* existing_value,
                     const Slice& value,
                     std::string* new_value,
                     Logger* logger) const = 0;

  // The name of the MergeOperator. Used to check for MergeOperator
  // mismatches (i.e., a DB created with one MergeOperator is accessed using a
  // different MergeOperator).
  // TODO: the name is currently not stored persistently and thus no checking
  //       is enforced. The client is responsible for providing a consistent
  //       MergeOperator between DB opens.
  virtual const char* Name() const = 0;
};

}  // namespace leveldb
|
||||
|
||||
#endif // STORAGE_LEVELDB_INCLUDE_MERGE_OPERATOR_H_
|
@ -0,0 +1,18 @@ |
||||
#ifndef MERGE_OPERATORS_H |
||||
#define MERGE_OPERATORS_H |
||||
|
||||
#include <memory> |
||||
|
||||
#include "leveldb/merge_operator.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
class MergeOperators { |
||||
public: |
||||
static std::shared_ptr<leveldb::MergeOperator> CreatePutOperator(); |
||||
static std::shared_ptr<leveldb::MergeOperator> CreateUInt64AddOperator(); |
||||
}; |
||||
|
||||
} |
||||
|
||||
#endif |
@ -0,0 +1,35 @@ |
||||
#include <memory> |
||||
#include "leveldb/slice.h" |
||||
#include "leveldb/merge_operator.h" |
||||
#include "utilities/merge_operators.h" |
||||
|
||||
using namespace leveldb; |
||||
|
||||
namespace { // anonymous namespace
|
||||
|
||||
// A merge operator that mimics Put semantics
|
||||
class PutOperator : public MergeOperator { |
||||
public: |
||||
virtual void Merge(const Slice& key, |
||||
const Slice* existing_value, |
||||
const Slice& value, |
||||
std::string* new_value, |
||||
Logger* logger) const override { |
||||
// put basically only looks at the current value
|
||||
new_value->assign(value.data(), value.size()); |
||||
} |
||||
|
||||
virtual const char* Name() const override { |
||||
return "PutOperator"; |
||||
} |
||||
}; |
||||
|
||||
} // end of anonymous namespace
|
||||
|
||||
namespace leveldb { |
||||
|
||||
std::shared_ptr<MergeOperator> MergeOperators::CreatePutOperator() { |
||||
return std::make_shared<PutOperator>(); |
||||
} |
||||
|
||||
} |
@ -0,0 +1,63 @@ |
||||
#include <memory> |
||||
#include "leveldb/env.h" |
||||
#include "leveldb/merge_operator.h" |
||||
#include "leveldb/slice.h" |
||||
#include "util/coding.h" |
||||
#include "utilities/merge_operators.h" |
||||
|
||||
|
||||
using namespace leveldb; |
||||
|
||||
namespace { // anonymous namespace
|
||||
|
||||
// A 'model' merge operator with uint64 addition semantics
|
||||
class UInt64AddOperator : public MergeOperator { |
||||
public: |
||||
virtual void Merge(const Slice& key, |
||||
const Slice* existing_value, |
||||
const Slice& value, |
||||
std::string* new_value, |
||||
Logger* logger) const override { |
||||
// assuming 0 if no existing value
|
||||
uint64_t existing = 0; |
||||
if (existing_value) { |
||||
if (existing_value->size() == sizeof(uint64_t)) { |
||||
existing = DecodeFixed64(existing_value->data()); |
||||
} else { |
||||
// if existing_value is corrupted, treat it as 0
|
||||
Log(logger, "existing value corruption, size: %zu > %zu", |
||||
existing_value->size(), sizeof(uint64_t)); |
||||
existing = 0; |
||||
} |
||||
} |
||||
|
||||
uint64_t operand; |
||||
if (value.size() == sizeof(uint64_t)) { |
||||
operand = DecodeFixed64(value.data()); |
||||
} else { |
||||
// if operand is corrupted, treat it as 0
|
||||
Log(logger, "operand value corruption, size: %zu > %zu", |
||||
value.size(), sizeof(uint64_t)); |
||||
operand = 0; |
||||
} |
||||
|
||||
new_value->resize(sizeof(uint64_t)); |
||||
EncodeFixed64(&(*new_value)[0], existing + operand); |
||||
|
||||
return; |
||||
} |
||||
|
||||
virtual const char* Name() const override { |
||||
return "UInt64AddOperator"; |
||||
} |
||||
}; |
||||
|
||||
} |
||||
|
||||
namespace leveldb { |
||||
|
||||
std::shared_ptr<MergeOperator> MergeOperators::CreateUInt64AddOperator() { |
||||
return std::make_shared<UInt64AddOperator>(); |
||||
} |
||||
|
||||
} |
Loading…
Reference in new issue