Optimize sequential insert into memtable - Part 1: Interface

Summary:
Currently our skip-list has an optimization to speed up sequential
inserts from a single stream, by remembering the last insert position.
We extend the idea to support sequential inserts from multiple streams,
and even tolerate small reordering within each stream.

This PR is the interface part, adding the following:
- Add `memtable_insert_with_hint_prefix_extractor` to allow specifying a prefix for each key.
- Add an `InsertWithHint()` interface to memtable, to allow the underlying
  implementation to return a hint of the insert position, which can later be
  passed back to optimize inserts.
- Memtable will maintain a map from prefixes to hints and pass the hint
  via `InsertWithHint()` if `memtable_insert_with_hint_prefix_extractor` is non-null
  (see the sketch below).
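
To make the idea concrete, here is a toy sketch (not RocksDB code; `ToyRep`, `ToyMemTable` and the list-based hint are made up for illustration) of how a per-prefix hint map can feed a hint-aware insert:

```cpp
// Toy illustration only (hypothetical ToyRep/ToyMemTable, not the RocksDB
// implementation): the memtable keeps one opaque hint per prefix and hands
// it to the rep on every insert for that prefix.
#include <cstddef>
#include <iterator>
#include <list>
#include <string>
#include <unordered_map>

class ToyRep {
 public:
  void Insert(const std::string& key) { data_.push_back(key); }

  // Same contract as the new InsertWithHint(): the caller passes the address
  // of an opaque hint (initially nullptr) and receives an updated hint back.
  void InsertWithHint(const std::string& key, void** hint) {
    using Iter = std::list<std::string>::iterator;
    Iter* last = static_cast<Iter*>(*hint);
    if (last == nullptr) {
      // First insert for this prefix: create a hint (never freed in this
      // sketch; a real rep would tie its lifetime to the memtable).
      last = new Iter(data_.end());
      *hint = last;
    }
    // Insert right after the remembered position, which is correct as long
    // as keys within a prefix arrive in (roughly) sorted order.
    Iter pos = (*last == data_.end()) ? data_.end() : std::next(*last);
    *last = data_.insert(pos, key);
  }

 private:
  std::list<std::string> data_;
};

class ToyMemTable {
 public:
  void Add(const std::string& key) {
    // Prefix = everything up to and including the first '_', if any.
    size_t sep = key.find('_');
    if (sep == std::string::npos) {
      rep_.Insert(key);  // no prefix: plain insert
    } else {
      std::string prefix = key.substr(0, sep + 1);
      rep_.InsertWithHint(key, &hints_[prefix]);  // one hint per prefix
    }
  }

 private:
  ToyRep rep_;
  std::unordered_map<std::string, void*> hints_;
};

int main() {
  ToyMemTable m;
  m.Add("foo_k1");  // creates the "foo_" hint
  m.Add("foo_k2");  // reuses and updates the "foo_" hint
  m.Add("bar_k1");  // independent "bar_" hint
  return 0;
}
```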
Closes https://github.com/facebook/rocksdb/pull/1419

Differential Revision: D4079367

Pulled By: yiwu-arbug

fbshipit-source-id: 3555326
Branch: main
Author: Yi Wu (committed by Facebook GitHub Bot)
parent df5eeb85ca
commit 1ea79a78c9
16 changed files (lines changed in parentheses):
- CMakeLists.txt (1)
- HISTORY.md (3)
- Makefile (4)
- db/column_family.cc (5)
- db/db_memtable_test.cc (160)
- db/memtable.cc (18)
- db/memtable.h (8)
- include/rocksdb/memtablerep.h (8)
- include/rocksdb/options.h (24)
- memtable/skiplistrep.cc (6)
- util/cf_options.cc (4)
- util/cf_options.h (2)
- util/hash.h (5)
- util/options.cc (7)
- util/options_helper.h (5)
- util/options_settable_test.cc (6)

@@ -496,6 +496,7 @@ set(TESTS
db/db_inplace_update_test.cc
db/db_iter_test.cc
db/db_log_iter_test.cc
db/db_memtable_test.cc
db/db_options_test.cc
db/db_properties_test.cc
db/db_table_properties_test.cc

@@ -5,7 +5,8 @@
* Support dynamically changing `delayed_write_rate` option via SetDBOptions().
### New Features
* Add avoid_flush_during_shutdown option, which speeds up DB shutdown by not flushing unpersisted data (i.e. with disableWAL = true). Unpersisted data will be lost. The option is dynamically changeable via SetDBOptions().
* Add memtable_insert_with_hint_prefix_extractor option. The option is meant to reduce CPU usage for inserting keys into a memtable when keys can be grouped by prefix and inserts within each prefix are sequential or almost sequential. See include/rocksdb/options.h for more details.
## 4.13.0 (10/18/2016)
### Public API Change

@@ -276,6 +276,7 @@ TESTS = \
db_flush_test \
db_inplace_update_test \
db_iterator_test \
db_memtable_test \
db_options_test \
db_sst_test \
external_sst_file_test \
@@ -955,6 +956,9 @@ db_inplace_update_test: db/db_inplace_update_test.o db/db_test_util.o $(LIBOBJEC
db_iterator_test: db/db_iterator_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
db_memtable_test: db/db_memtable_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
db_options_test: db/db_options_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)

@@ -270,6 +270,11 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
result.max_compaction_bytes = result.target_file_size_base * 25;
}
// Insert into memtable with hint is incompatible with concurrent inserts.
if (db_options.allow_concurrent_memtable_write) {
result.memtable_insert_with_hint_prefix_extractor = nullptr;
}
return result;
}

@@ -0,0 +1,160 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include <memory>
#include <string>
#include "db/db_test_util.h"
#include "db/memtable.h"
#include "port/stack_trace.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/slice_transform.h"
namespace rocksdb {
class DBMemTableTest : public DBTestBase {
public:
DBMemTableTest() : DBTestBase("/db_memtable_test") {}
};
class MockMemTableRep : public MemTableRep {
public:
explicit MockMemTableRep(MemTableAllocator* allocator, MemTableRep* rep)
: MemTableRep(allocator), rep_(rep), num_insert_with_hint_(0) {}
virtual KeyHandle Allocate(const size_t len, char** buf) override {
return rep_->Allocate(len, buf);
}
virtual void Insert(KeyHandle handle) override {
return rep_->Insert(handle);
}
virtual void InsertWithHint(KeyHandle handle, void** hint) override {
num_insert_with_hint_++;
ASSERT_NE(nullptr, hint);
last_hint_in_ = *hint;
rep_->InsertWithHint(handle, hint);
last_hint_out_ = *hint;
}
virtual bool Contains(const char* key) const override {
return rep_->Contains(key);
}
virtual void Get(const LookupKey& k, void* callback_args,
bool (*callback_func)(void* arg,
const char* entry)) override {
rep_->Get(k, callback_args, callback_func);
}
virtual size_t ApproximateMemoryUsage() override {
return rep_->ApproximateMemoryUsage();
}
virtual Iterator* GetIterator(Arena* arena) override {
return rep_->GetIterator(arena);
}
void* last_hint_in() { return last_hint_in_; }
void* last_hint_out() { return last_hint_out_; }
int num_insert_with_hint() { return num_insert_with_hint_; }
private:
std::unique_ptr<MemTableRep> rep_;
void* last_hint_in_;
void* last_hint_out_;
int num_insert_with_hint_;
};
class MockMemTableRepFactory : public MemTableRepFactory {
public:
virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& cmp,
MemTableAllocator* allocator,
const SliceTransform* transform,
Logger* logger) override {
SkipListFactory factory;
MemTableRep* skiplist_rep =
factory.CreateMemTableRep(cmp, allocator, transform, logger);
mock_rep_ = new MockMemTableRep(allocator, skiplist_rep);
return mock_rep_;
}
virtual const char* Name() const override { return "MockMemTableRepFactory"; }
MockMemTableRep* rep() { return mock_rep_; }
private:
MockMemTableRep* mock_rep_;
};
class TestPrefixExtractor : public SliceTransform {
public:
virtual const char* Name() const override { return "TestPrefixExtractor"; }
virtual Slice Transform(const Slice& key) const override {
const char* p = separator(key);
if (p == nullptr) {
return Slice();
}
return Slice(key.data(), p - key.data() + 1);
}
virtual bool InDomain(const Slice& key) const override {
return separator(key) != nullptr;
}
virtual bool InRange(const Slice& key) const override { return false; }
private:
const char* separator(const Slice& key) const {
return reinterpret_cast<const char*>(memchr(key.data(), '_', key.size()));
}
};
TEST_F(DBMemTableTest, InsertWithHint) {
Options options;
options.create_if_missing = true;
options.memtable_factory.reset(new MockMemTableRepFactory());
options.memtable_insert_with_hint_prefix_extractor.reset(
new TestPrefixExtractor());
Reopen(options);
MockMemTableRep* rep =
reinterpret_cast<MockMemTableRepFactory*>(options.memtable_factory.get())
->rep();
ASSERT_OK(Put("foo_k1", "foo_v1"));
ASSERT_EQ(nullptr, rep->last_hint_in());
void* hint_foo = rep->last_hint_out();
ASSERT_OK(Put("foo_k2", "foo_v2"));
ASSERT_EQ(hint_foo, rep->last_hint_in());
ASSERT_EQ(hint_foo, rep->last_hint_out());
ASSERT_OK(Put("foo_k3", "foo_v3"));
ASSERT_EQ(hint_foo, rep->last_hint_in());
ASSERT_EQ(hint_foo, rep->last_hint_out());
ASSERT_OK(Put("bar_k1", "bar_v1"));
ASSERT_EQ(nullptr, rep->last_hint_in());
void* hint_bar = rep->last_hint_out();
ASSERT_NE(hint_foo, hint_bar);
ASSERT_OK(Put("bar_k2", "bar_v2"));
ASSERT_EQ(hint_bar, rep->last_hint_in());
ASSERT_EQ(hint_bar, rep->last_hint_out());
ASSERT_EQ(5, rep->num_insert_with_hint());
ASSERT_OK(Put("whitelisted", "vvv"));
ASSERT_EQ(5, rep->num_insert_with_hint());
ASSERT_EQ("foo_v1", Get("foo_k1"));
ASSERT_EQ("foo_v2", Get("foo_k2"));
ASSERT_EQ("foo_v3", Get("foo_k3"));
ASSERT_EQ("bar_v1", Get("bar_k1"));
ASSERT_EQ("bar_v2", Get("bar_k2"));
ASSERT_EQ("vvv", Get("whitelisted"));
}
} // namespace rocksdb
int main(int argc, char** argv) {
rocksdb::port::InstallStackTraceHandler();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

@@ -87,7 +87,9 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
: 0),
prefix_extractor_(ioptions.prefix_extractor),
flush_state_(FLUSH_NOT_REQUESTED),
env_(ioptions.env),
insert_with_hint_prefix_extractor_(
ioptions.memtable_insert_with_hint_prefix_extractor) {
UpdateFlushState();
// something went wrong if we need to flush before inserting anything
assert(!ShouldScheduleFlush());
@@ -423,6 +425,7 @@ void MemTable::Add(SequenceNumber s, ValueType type,
char* p = EncodeVarint32(buf, internal_key_size);
memcpy(p, key.data(), key_size);
Slice key_slice(p, key_size);
p += key_size;
uint64_t packed = PackSequenceAndType(s, type);
EncodeFixed64(p, packed);
@@ -431,7 +434,18 @@ void MemTable::Add(SequenceNumber s, ValueType type,
memcpy(p, value.data(), val_size);
assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
if (!allow_concurrent) {
// Extract prefix for insert with hint.
Slice prefix;
if (insert_with_hint_prefix_extractor_ != nullptr) {
if (insert_with_hint_prefix_extractor_->InDomain(key_slice)) {
prefix = insert_with_hint_prefix_extractor_->Transform(key_slice);
}
}
if (prefix.empty()) {
table->Insert(handle);
} else {
table->InsertWithHint(handle, &insert_hints_[prefix]);
}
// this is a bit ugly, but is the way to avoid locked instructions
// when incrementing an atomic

@@ -13,6 +13,7 @@
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "db/dbformat.h"
#include "db/memtable_allocator.h"
@@ -25,6 +26,7 @@
#include "util/cf_options.h"
#include "util/concurrent_arena.h"
#include "util/dynamic_bloom.h"
#include "util/hash.h"
#include "util/instrumented_mutex.h"
namespace rocksdb {
@@ -390,6 +392,12 @@ class MemTable {
Env* env_;
// Extract sequential insert prefixes.
const SliceTransform* insert_with_hint_prefix_extractor_;
// Insert hints for each prefix.
std::unordered_map<Slice, void*, SliceHasher> insert_hints_;
// Returns a heuristic flush decision
bool ShouldFlushNow() const;

@@ -83,6 +83,14 @@ class MemTableRep {
// collection, and no concurrent modifications to the table in progress
virtual void Insert(KeyHandle handle) = 0;
// Same as Insert(), but in addition pass a hint to optimize sequential
// inserts. A new hint will be returned via the hint pointer. The caller can
// get an initial hint by passing a hint pointing to nullptr.
virtual void InsertWithHint(KeyHandle handle, void** hint) {
// Ignore the hint by default.
Insert(handle);
}
// Like Insert(handle), but may be called concurrent with other calls
// to InsertConcurrently for other handles
virtual void InsertConcurrently(KeyHandle handle) {
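
Caller-side, the contract can be exercised like this (a minimal, hypothetical sketch; `handles` is assumed to hold keys already encoded via `Allocate()` and belonging to one sequential stream):

```cpp
#include <vector>
#include "rocksdb/memtablerep.h"

// Hypothetical helper, for illustration only: drive InsertWithHint() for one
// stream of pre-allocated key handles, reusing a single hint across calls.
void InsertSequentialStream(rocksdb::MemTableRep* rep,
                            const std::vector<rocksdb::KeyHandle>& handles) {
  void* hint = nullptr;  // initial hint: a pointer to nullptr, per the comment
  for (rocksdb::KeyHandle handle : handles) {
    rep->InsertWithHint(handle, &hint);  // rep updates `hint` for the next call
  }
}
```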

@@ -746,6 +746,30 @@ struct ColumnFamilyOptions {
// Dynamically changeable through SetOptions() API
size_t memtable_huge_page_size;
// If non-nullptr, memtable will use the specified function to extract
// prefixes for keys, and for each non-empty prefix maintain a hint to
// reduce CPU usage for inserting keys with that prefix. Keys with an empty
// prefix will be inserted without using a hint.
//
// Currently only the default skiplist-based memtable implements the feature.
// All other memtable implementations will ignore the option. It incurs ~150
// additional bytes of memory overhead to store a hint for each prefix.
// If allow_concurrent_memtable_write is true, the option will also be
// ignored.
//
// The option is best suited for sequential inserts, or inserts that are
// almost sequential. One scenario could be inserting keys of the form
// (prefix + timestamp), where keys with the same prefix always come in
// time order, or in some cases a key with a smaller timestamp comes in
// later due to network latency.
//
// REQUIRES: If a custom comparator is provided, it has to make sure keys
// with the same prefix appear in a consecutive range.
//
// Default: nullptr (disabled)
std::shared_ptr<const SliceTransform>
memtable_insert_with_hint_prefix_extractor;
// Control locality of bloom filter probes to improve cache miss rate.
// This option only applies to memtable prefix bloom and plaintable
// prefix bloom. It essentially limits every bloom checking to one cache line.
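
For example, the option could be wired up like this (a configuration sketch only; the 8-byte fixed prefix and the key layout are assumptions for illustration):

```cpp
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"

// Sketch: keys are assumed to look like <8-byte stream id><timestamp>, so an
// 8-byte fixed prefix groups each stream and inserts within a stream are
// roughly sequential.
rocksdb::Options MakeHintedInsertOptions() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // One hint (~150 bytes, per the comment above) is kept per distinct prefix.
  options.memtable_insert_with_hint_prefix_extractor.reset(
      rocksdb::NewFixedPrefixTransform(8));
  // Per SanitizeOptions() in db/column_family.cc above, the extractor is
  // dropped when concurrent memtable writes are allowed.
  options.allow_concurrent_memtable_write = false;
  return options;
}
```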

@@ -36,6 +36,12 @@ public:
skip_list_.Insert(static_cast<char*>(handle));
}
virtual void InsertWithHint(KeyHandle handle, void** hint) override {
skip_list_.InsertWithHint(
static_cast<char*>(handle),
reinterpret_cast<decltype(skip_list_)::InsertHint**>(hint));
}
virtual void InsertConcurrently(KeyHandle handle) override {
skip_list_.InsertConcurrently(static_cast<char*>(handle));
}

@@ -71,7 +71,9 @@ ImmutableCFOptions::ImmutableCFOptions(const ImmutableDBOptions& db_options,
force_consistency_checks(cf_options.force_consistency_checks),
listeners(db_options.listeners),
row_cache(db_options.row_cache),
max_subcompactions(db_options.max_subcompactions),
memtable_insert_with_hint_prefix_extractor(
cf_options.memtable_insert_with_hint_prefix_extractor.get()) {}
// Multiply two operands. If they overflow, return op1.
uint64_t MultiplyCheckOverflow(uint64_t op1, double op2) {

@@ -115,6 +115,8 @@ struct ImmutableCFOptions {
std::shared_ptr<Cache> row_cache;
uint32_t max_subcompactions;
const SliceTransform* memtable_insert_with_hint_prefix_extractor;
};
struct MutableCFOptions {

@@ -27,4 +27,9 @@ inline uint32_t GetSliceHash(const Slice& s) {
return Hash(s.data(), s.size(), 397);
}
// std::hash compatible interface.
struct SliceHasher {
uint32_t operator()(const Slice& s) const { return GetSliceHash(s); }
};
} // namespace rocksdb

@@ -78,6 +78,7 @@ ColumnFamilyOptions::ColumnFamilyOptions()
inplace_callback(nullptr),
memtable_prefix_bloom_size_ratio(0.0),
memtable_huge_page_size(0),
memtable_insert_with_hint_prefix_extractor(nullptr),
bloom_locality(0),
max_successive_merges(0),
min_partial_merge_operands(2),
@@ -145,6 +146,8 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
memtable_prefix_bloom_size_ratio(
options.memtable_prefix_bloom_size_ratio),
memtable_huge_page_size(options.memtable_huge_page_size),
memtable_insert_with_hint_prefix_extractor(
options.memtable_insert_with_hint_prefix_extractor),
bloom_locality(options.bloom_locality),
max_successive_merges(options.max_successive_merges),
min_partial_merge_operands(options.min_partial_merge_operands),
@@ -463,6 +466,10 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
: CompressionTypeToString(bottommost_compression).c_str());
Header(log, " Options.prefix_extractor: %s",
prefix_extractor == nullptr ? "nullptr" : prefix_extractor->Name());
Header(log, " Options.memtable_insert_with_hint_prefix_extractor: %s",
memtable_insert_with_hint_prefix_extractor == nullptr
? "nullptr"
: memtable_insert_with_hint_prefix_extractor->Name());
Header(log, " Options.num_levels: %d", num_levels);
Header(log, " Options.min_write_buffer_number_to_merge: %d",
min_write_buffer_number_to_merge);

@@ -531,6 +531,11 @@ static std::unordered_map<std::string, OptionTypeInfo> cf_options_type_info = {
{offsetof(struct ColumnFamilyOptions, prefix_extractor),
OptionType::kSliceTransform, OptionVerificationType::kByNameAllowNull,
false, 0}},
{"memtable_insert_with_hint_prefix_extractor",
{offsetof(struct ColumnFamilyOptions,
memtable_insert_with_hint_prefix_extractor),
OptionType::kSliceTransform, OptionVerificationType::kByNameAllowNull,
false, 0}},
{"memtable_factory",
{offsetof(struct ColumnFamilyOptions, memtable_factory),
OptionType::kMemTableRepFactory, OptionVerificationType::kByName, false,

@@ -312,6 +312,8 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
// kColumnFamilyOptionsBlacklist, and maybe add customized verification
// for it.
TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
// options in the blacklist need to appear in the same order as in
// ColumnFamilyOptions.
const OffsetGap kColumnFamilyOptionsBlacklist = {
{offsetof(struct ColumnFamilyOptions, comparator), sizeof(Comparator*)},
{offsetof(struct ColumnFamilyOptions, merge_operator),
@@ -336,6 +338,9 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
sizeof(ColumnFamilyOptions::TablePropertiesCollectorFactories)},
{offsetof(struct ColumnFamilyOptions, inplace_callback),
sizeof(UpdateStatus(*)(char*, uint32_t*, Slice, std::string*))},
{offsetof(struct ColumnFamilyOptions,
memtable_insert_with_hint_prefix_extractor),
sizeof(std::shared_ptr<const SliceTransform>)},
};
char* options_ptr = new char[sizeof(ColumnFamilyOptions)];
@@ -419,6 +424,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
"verify_checksums_in_compaction=false;"
"merge_operator=aabcxehazrMergeOperator;"
"memtable_prefix_bloom_size_ratio=0.4642;"
"memtable_insert_with_hint_prefix_extractor=rocksdb.CappedPrefix.13;"
"paranoid_file_checks=true;"
"force_consistency_checks=true;"
"inplace_update_num_locks=7429;"
