Allow allocating dynamic bloom, plain table indexes and hash linked list from huge page TLB

Summary: Add an option to allocate a piece of memory from huge page TLB. Add options to trigger it in dynamic bloom, plain table indexes and hash linked list hash table.

Test Plan: make all check

Reviewers: haobo, ljin

Reviewed By: haobo

CC: nkg-, dhruba, leveldb, igor, yhchiang

Differential Revision: https://reviews.facebook.net/D18357
main
sdong 11 years ago
parent 66f88c43a5
commit 7dafa3a1d7
  1. 2
      db/db_test.cc
  2. 7
      db/memtable.cc
  3. 631
      db/plain_table_db_test.cc
  4. 8
      db/prefix_test.cc
  5. 9
      include/rocksdb/memtablerep.h
  6. 8
      include/rocksdb/options.h
  7. 17
      include/rocksdb/table.h
  8. 14
      table/plain_table_factory.cc
  9. 10
      table/plain_table_factory.h
  10. 41
      table/plain_table_reader.cc
  11. 12
      table/plain_table_reader.h
  12. 30
      util/arena.cc
  13. 17
      util/arena.h
  14. 27
      util/dynamic_bloom.cc
  15. 18
      util/dynamic_bloom.h
  16. 25
      util/hash_linklist_rep.cc
  17. 6
      util/hash_linklist_rep.h
  18. 8
      util/options.cc

@ -480,7 +480,7 @@ class DBTest {
break; break;
case kHashLinkList: case kHashLinkList:
options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.prefix_extractor.reset(NewFixedPrefixTransform(1));
options.memtable_factory.reset(NewHashLinkListRepFactory(4)); options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0));
break; break;
case kHashCuckoo: case kHashCuckoo:
options.memtable_factory.reset( options.memtable_factory.reset(

@ -52,9 +52,10 @@ MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
// gone wrong already. // gone wrong already.
assert(!should_flush_); assert(!should_flush_);
if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) { if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) {
prefix_bloom_.reset(new DynamicBloom(options.memtable_prefix_bloom_bits, prefix_bloom_.reset(new DynamicBloom(
options.bloom_locality, options.memtable_prefix_bloom_bits, options.bloom_locality,
options.memtable_prefix_bloom_probes)); options.memtable_prefix_bloom_probes, nullptr,
options.memtable_prefix_bloom_huge_page_tlb_size));
} }
} }

@ -185,7 +185,7 @@ class TestPlainTableReader : public PlainTableReader {
const Options& options, bool* expect_bloom_not_match) const Options& options, bool* expect_bloom_not_match)
: PlainTableReader(options, std::move(file), storage_options, icomparator, : PlainTableReader(options, std::move(file), storage_options, icomparator,
file_size, bloom_bits_per_key, hash_table_ratio, file_size, bloom_bits_per_key, hash_table_ratio,
index_sparseness, table_properties), index_sparseness, table_properties, 2 * 1024 * 1024),
expect_bloom_not_match_(expect_bloom_not_match) { expect_bloom_not_match_(expect_bloom_not_match) {
Status s = PopulateIndex(const_cast<TableProperties*>(table_properties)); Status s = PopulateIndex(const_cast<TableProperties*>(table_properties));
ASSERT_TRUE(s.ok()); ASSERT_TRUE(s.ok());
@ -206,13 +206,12 @@ extern const uint64_t kPlainTableMagicNumber;
class TestPlainTableFactory : public PlainTableFactory { class TestPlainTableFactory : public PlainTableFactory {
public: public:
explicit TestPlainTableFactory(bool* expect_bloom_not_match, explicit TestPlainTableFactory(bool* expect_bloom_not_match,
uint32_t user_key_len = uint32_t user_key_len, int bloom_bits_per_key,
kPlainTableVariableLength, double hash_table_ratio,
int bloom_bits_per_key = 0, size_t index_sparseness,
double hash_table_ratio = 0.75, size_t huge_page_tlb_size)
size_t index_sparseness = 16)
: PlainTableFactory(user_key_len, user_key_len, hash_table_ratio, : PlainTableFactory(user_key_len, user_key_len, hash_table_ratio,
hash_table_ratio), index_sparseness, huge_page_tlb_size),
bloom_bits_per_key_(bloom_bits_per_key), bloom_bits_per_key_(bloom_bits_per_key),
hash_table_ratio_(hash_table_ratio), hash_table_ratio_(hash_table_ratio),
index_sparseness_(index_sparseness), index_sparseness_(index_sparseness),
@ -244,197 +243,209 @@ class TestPlainTableFactory : public PlainTableFactory {
}; };
TEST(PlainTableDBTest, Flush) { TEST(PlainTableDBTest, Flush) {
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
for (int total_order = 0; total_order <= 1; total_order++) { huge_page_tlb_size += 2 * 1024 * 1024) {
Options options = CurrentOptions(); for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
options.create_if_missing = true; for (int total_order = 0; total_order <= 1; total_order++) {
// Set only one bucket to force bucket conflict. Options options = CurrentOptions();
// Test index interval for the same prefix to be 1, 2 and 4 options.create_if_missing = true;
if (total_order) { // Set only one bucket to force bucket conflict.
options.table_factory.reset( // Test index interval for the same prefix to be 1, 2 and 4
NewTotalOrderPlainTableFactory(16, bloom_bits, 2)); if (total_order) {
} else { options.table_factory.reset(NewTotalOrderPlainTableFactory(
options.table_factory.reset(NewPlainTableFactory(16, bloom_bits)); 16, bloom_bits, 2, huge_page_tlb_size));
} else {
options.table_factory.reset(NewPlainTableFactory(
16, bloom_bits, 0.75, 16, huge_page_tlb_size));
}
DestroyAndReopen(&options);
ASSERT_OK(Put("1000000000000foo", "v1"));
ASSERT_OK(Put("0000000000000bar", "v2"));
ASSERT_OK(Put("1000000000000foo", "v3"));
dbfull()->TEST_FlushMemTable();
TablePropertiesCollection ptc;
reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
ASSERT_EQ(1U, ptc.size());
auto row = ptc.begin();
auto tp = row->second;
ASSERT_EQ(total_order ? "4" : "12", (tp->user_collected_properties).at(
"plain_table_hash_table_size"));
ASSERT_EQ(total_order ? "9" : "0", (tp->user_collected_properties).at(
"plain_table_sub_index_size"));
ASSERT_EQ("v3", Get("1000000000000foo"));
ASSERT_EQ("v2", Get("0000000000000bar"));
} }
DestroyAndReopen(&options);
ASSERT_OK(Put("1000000000000foo", "v1"));
ASSERT_OK(Put("0000000000000bar", "v2"));
ASSERT_OK(Put("1000000000000foo", "v3"));
dbfull()->TEST_FlushMemTable();
TablePropertiesCollection ptc;
reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
ASSERT_EQ(1U, ptc.size());
auto row = ptc.begin();
auto tp = row->second;
ASSERT_EQ(
total_order ? "4" : "12",
(tp->user_collected_properties).at("plain_table_hash_table_size"));
ASSERT_EQ(
total_order ? "9" : "0",
(tp->user_collected_properties).at("plain_table_sub_index_size"));
ASSERT_EQ("v3", Get("1000000000000foo"));
ASSERT_EQ("v2", Get("0000000000000bar"));
} }
} }
} }
TEST(PlainTableDBTest, Flush2) { TEST(PlainTableDBTest, Flush2) {
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
for (int total_order = 0; total_order <= 1; total_order++) { huge_page_tlb_size += 2 * 1024 * 1024) {
bool expect_bloom_not_match = false; for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
Options options = CurrentOptions(); for (int total_order = 0; total_order <= 1; total_order++) {
options.create_if_missing = true; bool expect_bloom_not_match = false;
// Set only one bucket to force bucket conflict. Options options = CurrentOptions();
// Test index interval for the same prefix to be 1, 2 and 4 options.create_if_missing = true;
if (total_order) { // Set only one bucket to force bucket conflict.
options.prefix_extractor = nullptr; // Test index interval for the same prefix to be 1, 2 and 4
options.table_factory.reset(new TestPlainTableFactory(
&expect_bloom_not_match, 16, bloom_bits, 0, 2));
} else {
options.table_factory.reset(
new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits));
}
DestroyAndReopen(&options);
ASSERT_OK(Put("0000000000000bar", "b"));
ASSERT_OK(Put("1000000000000foo", "v1"));
dbfull()->TEST_FlushMemTable();
ASSERT_OK(Put("1000000000000foo", "v2"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v2", Get("1000000000000foo"));
ASSERT_OK(Put("0000000000000eee", "v3"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v3", Get("0000000000000eee"));
ASSERT_OK(Delete("0000000000000bar"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
ASSERT_OK(Put("0000000000000eee", "v5"));
ASSERT_OK(Put("9000000000000eee", "v5"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v5", Get("0000000000000eee"));
// Test Bloom Filter
if (bloom_bits > 0) {
// Neither key nor value should exist.
expect_bloom_not_match = true;
ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
// Key doesn't exist any more but prefix exists.
if (total_order) { if (total_order) {
ASSERT_EQ("NOT_FOUND", Get("1000000000000not")); options.prefix_extractor = nullptr;
ASSERT_EQ("NOT_FOUND", Get("0000000000000not")); options.table_factory.reset(
new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
0, 2, huge_page_tlb_size));
} else {
options.table_factory.reset(
new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
0.75, 16, huge_page_tlb_size));
}
DestroyAndReopen(&options);
ASSERT_OK(Put("0000000000000bar", "b"));
ASSERT_OK(Put("1000000000000foo", "v1"));
dbfull()->TEST_FlushMemTable();
ASSERT_OK(Put("1000000000000foo", "v2"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v2", Get("1000000000000foo"));
ASSERT_OK(Put("0000000000000eee", "v3"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v3", Get("0000000000000eee"));
ASSERT_OK(Delete("0000000000000bar"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
ASSERT_OK(Put("0000000000000eee", "v5"));
ASSERT_OK(Put("9000000000000eee", "v5"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v5", Get("0000000000000eee"));
// Test Bloom Filter
if (bloom_bits > 0) {
// Neither key nor value should exist.
expect_bloom_not_match = true;
ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
// Key doesn't exist any more but prefix exists.
if (total_order) {
ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
ASSERT_EQ("NOT_FOUND", Get("0000000000000not"));
}
expect_bloom_not_match = false;
} }
expect_bloom_not_match = false;
} }
} }
} }
} }
TEST(PlainTableDBTest, Iterator) { TEST(PlainTableDBTest, Iterator) {
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
for (int total_order = 0; total_order <= 1; total_order++) { huge_page_tlb_size += 2 * 1024 * 1024) {
bool expect_bloom_not_match = false; for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
Options options = CurrentOptions(); for (int total_order = 0; total_order <= 1; total_order++) {
options.create_if_missing = true; bool expect_bloom_not_match = false;
// Set only one bucket to force bucket conflict. Options options = CurrentOptions();
// Test index interval for the same prefix to be 1, 2 and 4 options.create_if_missing = true;
if (total_order) { // Set only one bucket to force bucket conflict.
options.prefix_extractor = nullptr; // Test index interval for the same prefix to be 1, 2 and 4
options.table_factory.reset(new TestPlainTableFactory( if (total_order) {
&expect_bloom_not_match, 16, bloom_bits, 0, 2)); options.prefix_extractor = nullptr;
} else { options.table_factory.reset(
options.table_factory.reset( new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits)); 0, 2, huge_page_tlb_size));
} } else {
DestroyAndReopen(&options); options.table_factory.reset(
new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
ASSERT_OK(Put("1000000000foo002", "v_2")); 0.75, 16, huge_page_tlb_size));
ASSERT_OK(Put("0000000000000bar", "random")); }
ASSERT_OK(Put("1000000000foo001", "v1")); DestroyAndReopen(&options);
ASSERT_OK(Put("3000000000000bar", "bar_v"));
ASSERT_OK(Put("1000000000foo003", "v__3")); ASSERT_OK(Put("1000000000foo002", "v_2"));
ASSERT_OK(Put("1000000000foo004", "v__4")); ASSERT_OK(Put("0000000000000bar", "random"));
ASSERT_OK(Put("1000000000foo005", "v__5")); ASSERT_OK(Put("1000000000foo001", "v1"));
ASSERT_OK(Put("1000000000foo007", "v__7")); ASSERT_OK(Put("3000000000000bar", "bar_v"));
ASSERT_OK(Put("1000000000foo008", "v__8")); ASSERT_OK(Put("1000000000foo003", "v__3"));
dbfull()->TEST_FlushMemTable(); ASSERT_OK(Put("1000000000foo004", "v__4"));
ASSERT_EQ("v1", Get("1000000000foo001")); ASSERT_OK(Put("1000000000foo005", "v__5"));
ASSERT_EQ("v__3", Get("1000000000foo003")); ASSERT_OK(Put("1000000000foo007", "v__7"));
Iterator* iter = dbfull()->NewIterator(ReadOptions()); ASSERT_OK(Put("1000000000foo008", "v__8"));
iter->Seek("1000000000foo000"); dbfull()->TEST_FlushMemTable();
ASSERT_TRUE(iter->Valid()); ASSERT_EQ("v1", Get("1000000000foo001"));
ASSERT_EQ("1000000000foo001", iter->key().ToString()); ASSERT_EQ("v__3", Get("1000000000foo003"));
ASSERT_EQ("v1", iter->value().ToString()); Iterator* iter = dbfull()->NewIterator(ReadOptions());
iter->Seek("1000000000foo000");
iter->Next(); ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(iter->Valid()); ASSERT_EQ("1000000000foo001", iter->key().ToString());
ASSERT_EQ("1000000000foo002", iter->key().ToString()); ASSERT_EQ("v1", iter->value().ToString());
ASSERT_EQ("v_2", iter->value().ToString());
iter->Next(); iter->Next();
ASSERT_TRUE(iter->Valid()); ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo003", iter->key().ToString()); ASSERT_EQ("1000000000foo002", iter->key().ToString());
ASSERT_EQ("v__3", iter->value().ToString()); ASSERT_EQ("v_2", iter->value().ToString());
iter->Next(); iter->Next();
ASSERT_TRUE(iter->Valid()); ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo004", iter->key().ToString()); ASSERT_EQ("1000000000foo003", iter->key().ToString());
ASSERT_EQ("v__4", iter->value().ToString()); ASSERT_EQ("v__3", iter->value().ToString());
iter->Seek("3000000000000bar"); iter->Next();
ASSERT_TRUE(iter->Valid()); ASSERT_TRUE(iter->Valid());
ASSERT_EQ("3000000000000bar", iter->key().ToString()); ASSERT_EQ("1000000000foo004", iter->key().ToString());
ASSERT_EQ("bar_v", iter->value().ToString()); ASSERT_EQ("v__4", iter->value().ToString());
iter->Seek("1000000000foo000"); iter->Seek("3000000000000bar");
ASSERT_TRUE(iter->Valid()); ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo001", iter->key().ToString()); ASSERT_EQ("3000000000000bar", iter->key().ToString());
ASSERT_EQ("v1", iter->value().ToString()); ASSERT_EQ("bar_v", iter->value().ToString());
iter->Seek("1000000000foo005"); iter->Seek("1000000000foo000");
ASSERT_TRUE(iter->Valid()); ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo005", iter->key().ToString()); ASSERT_EQ("1000000000foo001", iter->key().ToString());
ASSERT_EQ("v__5", iter->value().ToString()); ASSERT_EQ("v1", iter->value().ToString());
iter->Seek("1000000000foo006"); iter->Seek("1000000000foo005");
ASSERT_TRUE(iter->Valid()); ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo007", iter->key().ToString()); ASSERT_EQ("1000000000foo005", iter->key().ToString());
ASSERT_EQ("v__7", iter->value().ToString()); ASSERT_EQ("v__5", iter->value().ToString());
iter->Seek("1000000000foo008"); iter->Seek("1000000000foo006");
ASSERT_TRUE(iter->Valid()); ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo008", iter->key().ToString()); ASSERT_EQ("1000000000foo007", iter->key().ToString());
ASSERT_EQ("v__8", iter->value().ToString()); ASSERT_EQ("v__7", iter->value().ToString());
if (total_order == 0) { iter->Seek("1000000000foo008");
iter->Seek("1000000000foo009");
ASSERT_TRUE(iter->Valid()); ASSERT_TRUE(iter->Valid());
ASSERT_EQ("3000000000000bar", iter->key().ToString()); ASSERT_EQ("1000000000foo008", iter->key().ToString());
} ASSERT_EQ("v__8", iter->value().ToString());
// Test Bloom Filter if (total_order == 0) {
if (bloom_bits > 0) { iter->Seek("1000000000foo009");
if (!total_order) { ASSERT_TRUE(iter->Valid());
// Neither key nor value should exist. ASSERT_EQ("3000000000000bar", iter->key().ToString());
expect_bloom_not_match = true;
iter->Seek("2not000000000bar");
ASSERT_TRUE(!iter->Valid());
ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
expect_bloom_not_match = false;
} else {
expect_bloom_not_match = true;
ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
expect_bloom_not_match = false;
} }
}
delete iter; // Test Bloom Filter
if (bloom_bits > 0) {
if (!total_order) {
// Neither key nor value should exist.
expect_bloom_not_match = true;
iter->Seek("2not000000000bar");
ASSERT_TRUE(!iter->Valid());
ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
expect_bloom_not_match = false;
} else {
expect_bloom_not_match = true;
ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
expect_bloom_not_match = false;
}
}
delete iter;
}
} }
} }
} }
@ -581,165 +592,173 @@ TEST(PlainTableDBTest, IteratorReverseSuffixComparator) {
} }
TEST(PlainTableDBTest, HashBucketConflict) { TEST(PlainTableDBTest, HashBucketConflict) {
for (unsigned char i = 1; i <= 3; i++) { for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
Options options = CurrentOptions(); huge_page_tlb_size += 2 * 1024 * 1024) {
options.create_if_missing = true; for (unsigned char i = 1; i <= 3; i++) {
// Set only one bucket to force bucket conflict. Options options = CurrentOptions();
// Test index interval for the same prefix to be 1, 2 and 4 options.create_if_missing = true;
options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 2 ^ i)); // Set only one bucket to force bucket conflict.
DestroyAndReopen(&options); // Test index interval for the same prefix to be 1, 2 and 4
ASSERT_OK(Put("5000000000000fo0", "v1")); options.table_factory.reset(
ASSERT_OK(Put("5000000000000fo1", "v2")); NewTotalOrderPlainTableFactory(16, 0, 2 ^ i, huge_page_tlb_size));
ASSERT_OK(Put("5000000000000fo2", "v")); DestroyAndReopen(&options);
ASSERT_OK(Put("2000000000000fo0", "v3")); ASSERT_OK(Put("5000000000000fo0", "v1"));
ASSERT_OK(Put("2000000000000fo1", "v4")); ASSERT_OK(Put("5000000000000fo1", "v2"));
ASSERT_OK(Put("2000000000000fo2", "v")); ASSERT_OK(Put("5000000000000fo2", "v"));
ASSERT_OK(Put("2000000000000fo3", "v")); ASSERT_OK(Put("2000000000000fo0", "v3"));
ASSERT_OK(Put("2000000000000fo1", "v4"));
dbfull()->TEST_FlushMemTable(); ASSERT_OK(Put("2000000000000fo2", "v"));
ASSERT_OK(Put("2000000000000fo3", "v"));
ASSERT_EQ("v1", Get("5000000000000fo0"));
ASSERT_EQ("v2", Get("5000000000000fo1"));
ASSERT_EQ("v3", Get("2000000000000fo0"));
ASSERT_EQ("v4", Get("2000000000000fo1"));
ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
ReadOptions ro;
Iterator* iter = dbfull()->NewIterator(ro);
iter->Seek("5000000000000fo0");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo0", iter->key().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
iter->Seek("5000000000000fo1"); dbfull()->TEST_FlushMemTable();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
iter->Seek("2000000000000fo0"); ASSERT_EQ("v1", Get("5000000000000fo0"));
ASSERT_TRUE(iter->Valid()); ASSERT_EQ("v2", Get("5000000000000fo1"));
ASSERT_EQ("2000000000000fo0", iter->key().ToString()); ASSERT_EQ("v3", Get("2000000000000fo0"));
iter->Next(); ASSERT_EQ("v4", Get("2000000000000fo1"));
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
iter->Seek("2000000000000fo1"); ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
ASSERT_TRUE(iter->Valid()); ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
ASSERT_EQ("2000000000000fo1", iter->key().ToString()); ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
iter->Seek("2000000000000bar"); ReadOptions ro;
ASSERT_TRUE(iter->Valid()); Iterator* iter = dbfull()->NewIterator(ro);
ASSERT_EQ("2000000000000fo0", iter->key().ToString());
iter->Seek("5000000000000bar"); iter->Seek("5000000000000fo0");
ASSERT_TRUE(iter->Valid()); ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo0", iter->key().ToString()); ASSERT_EQ("5000000000000fo0", iter->key().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
iter->Seek("5000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
iter->Seek("2000000000000fo8"); iter->Seek("2000000000000fo0");
ASSERT_TRUE(!iter->Valid() || ASSERT_TRUE(iter->Valid());
options.comparator->Compare(iter->key(), "20000001") > 0); ASSERT_EQ("2000000000000fo0", iter->key().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
iter->Seek("2000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
iter->Seek("2000000000000bar");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo0", iter->key().ToString());
iter->Seek("5000000000000bar");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo0", iter->key().ToString());
iter->Seek("5000000000000fo8"); iter->Seek("2000000000000fo8");
ASSERT_TRUE(!iter->Valid()); ASSERT_TRUE(!iter->Valid() ||
options.comparator->Compare(iter->key(), "20000001") > 0);
iter->Seek("1000000000000fo2"); iter->Seek("5000000000000fo8");
ASSERT_TRUE(!iter->Valid()); ASSERT_TRUE(!iter->Valid());
iter->Seek("3000000000000fo2"); iter->Seek("1000000000000fo2");
ASSERT_TRUE(!iter->Valid()); ASSERT_TRUE(!iter->Valid());
iter->Seek("8000000000000fo2"); iter->Seek("3000000000000fo2");
ASSERT_TRUE(!iter->Valid()); ASSERT_TRUE(!iter->Valid());
delete iter; iter->Seek("8000000000000fo2");
ASSERT_TRUE(!iter->Valid());
delete iter;
}
} }
} }
TEST(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) { TEST(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
for (unsigned char i = 1; i <= 3; i++) { for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
Options options = CurrentOptions(); huge_page_tlb_size += 2 * 1024 * 1024) {
options.create_if_missing = true; for (unsigned char i = 1; i <= 3; i++) {
SimpleSuffixReverseComparator comp; Options options = CurrentOptions();
options.comparator = &comp; options.create_if_missing = true;
// Set only one bucket to force bucket conflict. SimpleSuffixReverseComparator comp;
// Test index interval for the same prefix to be 1, 2 and 4 options.comparator = &comp;
options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 2 ^ i)); // Set only one bucket to force bucket conflict.
DestroyAndReopen(&options); // Test index interval for the same prefix to be 1, 2 and 4
ASSERT_OK(Put("5000000000000fo0", "v1")); options.table_factory.reset(
ASSERT_OK(Put("5000000000000fo1", "v2")); NewTotalOrderPlainTableFactory(16, 0, 2 ^ i, huge_page_tlb_size));
ASSERT_OK(Put("5000000000000fo2", "v")); DestroyAndReopen(&options);
ASSERT_OK(Put("2000000000000fo0", "v3")); ASSERT_OK(Put("5000000000000fo0", "v1"));
ASSERT_OK(Put("2000000000000fo1", "v4")); ASSERT_OK(Put("5000000000000fo1", "v2"));
ASSERT_OK(Put("2000000000000fo2", "v")); ASSERT_OK(Put("5000000000000fo2", "v"));
ASSERT_OK(Put("2000000000000fo3", "v")); ASSERT_OK(Put("2000000000000fo0", "v3"));
ASSERT_OK(Put("2000000000000fo1", "v4"));
dbfull()->TEST_FlushMemTable(); ASSERT_OK(Put("2000000000000fo2", "v"));
ASSERT_OK(Put("2000000000000fo3", "v"));
ASSERT_EQ("v1", Get("5000000000000fo0"));
ASSERT_EQ("v2", Get("5000000000000fo1"));
ASSERT_EQ("v3", Get("2000000000000fo0"));
ASSERT_EQ("v4", Get("2000000000000fo1"));
ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
ReadOptions ro;
Iterator* iter = dbfull()->NewIterator(ro);
iter->Seek("5000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo0", iter->key().ToString());
iter->Seek("5000000000000fo1"); dbfull()->TEST_FlushMemTable();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
iter->Seek("2000000000000fo1"); ASSERT_EQ("v1", Get("5000000000000fo0"));
ASSERT_TRUE(iter->Valid()); ASSERT_EQ("v2", Get("5000000000000fo1"));
ASSERT_EQ("2000000000000fo1", iter->key().ToString()); ASSERT_EQ("v3", Get("2000000000000fo0"));
iter->Next(); ASSERT_EQ("v4", Get("2000000000000fo1"));
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo0", iter->key().ToString());
iter->Seek("2000000000000fo1"); ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
ASSERT_TRUE(iter->Valid()); ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
ASSERT_EQ("2000000000000fo1", iter->key().ToString()); ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
iter->Seek("2000000000000var"); ReadOptions ro;
ASSERT_TRUE(iter->Valid()); Iterator* iter = dbfull()->NewIterator(ro);
ASSERT_EQ("2000000000000fo3", iter->key().ToString());
iter->Seek("5000000000000var"); iter->Seek("5000000000000fo1");
ASSERT_TRUE(iter->Valid()); ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo2", iter->key().ToString()); ASSERT_EQ("5000000000000fo1", iter->key().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo0", iter->key().ToString());
iter->Seek("5000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
iter->Seek("2000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo0", iter->key().ToString());
std::string seek_key = "2000000000000bar"; iter->Seek("2000000000000fo1");
iter->Seek(seek_key); ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(!iter->Valid() || ASSERT_EQ("2000000000000fo1", iter->key().ToString());
options.prefix_extractor->Transform(iter->key()) !=
options.prefix_extractor->Transform(seek_key));
iter->Seek("1000000000000fo2"); iter->Seek("2000000000000var");
ASSERT_TRUE(!iter->Valid()); ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo3", iter->key().ToString());
iter->Seek("5000000000000var");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo2", iter->key().ToString());
iter->Seek("3000000000000fo2"); std::string seek_key = "2000000000000bar";
ASSERT_TRUE(!iter->Valid()); iter->Seek(seek_key);
ASSERT_TRUE(!iter->Valid() ||
options.prefix_extractor->Transform(iter->key()) !=
options.prefix_extractor->Transform(seek_key));
iter->Seek("8000000000000fo2"); iter->Seek("1000000000000fo2");
ASSERT_TRUE(!iter->Valid()); ASSERT_TRUE(!iter->Valid());
delete iter; iter->Seek("3000000000000fo2");
ASSERT_TRUE(!iter->Valid());
iter->Seek("8000000000000fo2");
ASSERT_TRUE(!iter->Valid());
delete iter;
}
} }
} }

@ -30,6 +30,7 @@ DEFINE_int64(min_write_buffer_number_to_merge, 1, "");
DEFINE_int32(skiplist_height, 4, ""); DEFINE_int32(skiplist_height, 4, "");
DEFINE_int32(memtable_prefix_bloom_bits, 10000000, ""); DEFINE_int32(memtable_prefix_bloom_bits, 10000000, "");
DEFINE_int32(memtable_prefix_bloom_probes, 10, ""); DEFINE_int32(memtable_prefix_bloom_probes, 10, "");
DEFINE_int32(memtable_prefix_bloom_huge_page_tlb_size, 2 * 1024 * 1024, "");
DEFINE_int32(value_size, 40, ""); DEFINE_int32(value_size, 40, "");
// Path to the database on file system // Path to the database on file system
@ -148,6 +149,8 @@ class PrefixTest {
options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits; options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits;
options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes; options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes;
options.memtable_prefix_bloom_huge_page_tlb_size =
FLAGS_memtable_prefix_bloom_huge_page_tlb_size;
Status s = DB::Open(options, kDbName, &db); Status s = DB::Open(options, kDbName, &db);
ASSERT_OK(s); ASSERT_OK(s);
@ -172,6 +175,10 @@ class PrefixTest {
options.memtable_factory.reset( options.memtable_factory.reset(
NewHashLinkListRepFactory(bucket_count)); NewHashLinkListRepFactory(bucket_count));
return true; return true;
case kHashLinkListHugePageTlb:
options.memtable_factory.reset(
NewHashLinkListRepFactory(bucket_count, 2 * 1024 * 1024));
return true;
default: default:
return false; return false;
} }
@ -190,6 +197,7 @@ class PrefixTest {
kBegin, kBegin,
kHashSkipList, kHashSkipList,
kHashLinkList, kHashLinkList,
kHashLinkListHugePageTlb,
kEnd kEnd
}; };
int option_config_; int option_config_;

@ -223,9 +223,14 @@ extern MemTableRepFactory* NewHashSkipListRepFactory(
// The factory is to create memtables with a hashed linked list: // The factory is to create memtables with a hashed linked list:
// it contains a fixed array of buckets, each pointing to a sorted single // it contains a fixed array of buckets, each pointing to a sorted single
// linked list (null if the bucket is empty). // linked list (null if the bucket is empty).
// bucket_count: number of fixed array buckets // @bucket_count: number of fixed array buckets
// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
// Otherwise from huge page TLB. The user needs to reserve
// huge pages for it to be allocated, like:
// sysctl -w vm.nr_hugepages=20
// See linux doc Documentation/vm/hugetlbpage.txt
extern MemTableRepFactory* NewHashLinkListRepFactory( extern MemTableRepFactory* NewHashLinkListRepFactory(
size_t bucket_count = 50000); size_t bucket_count = 50000, size_t huge_page_tlb_size = 2 * 1024 * 1024);
// This factory creates a cuckoo-hashing based mem-table representation. // This factory creates a cuckoo-hashing based mem-table representation.
// Cuckoo-hash is a closed-hash strategy, in which all key/value pairs // Cuckoo-hash is a closed-hash strategy, in which all key/value pairs

@ -497,6 +497,14 @@ struct ColumnFamilyOptions {
// number of hash probes per key // number of hash probes per key
uint32_t memtable_prefix_bloom_probes; uint32_t memtable_prefix_bloom_probes;
// Page size for huge page TLB for bloom in memtable. If <=0, not allocate
// from huge page TLB but from malloc.
// Need to reserve huge pages for it to be allocated. For example:
// sysctl -w vm.nr_hugepages=20
// See linux doc Documentation/vm/hugetlbpage.txt
size_t memtable_prefix_bloom_huge_page_tlb_size;
// Control locality of bloom filter probes to improve cache miss rate. // Control locality of bloom filter probes to improve cache miss rate.
// This option only applies to memtable prefix bloom and plaintable // This option only applies to memtable prefix bloom and plaintable
// prefix bloom. It essentially limits the max number of cache lines each // prefix bloom. It essentially limits the max number of cache lines each

@ -96,12 +96,19 @@ extern TableFactory* NewBlockBasedTableFactory(
// in the hash table // in the hash table
// @index_sparseness: inside each prefix, need to build one index record for how // @index_sparseness: inside each prefix, need to build one index record for how
// many keys for binary search inside each hash bucket. // many keys for binary search inside each hash bucket.
// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
// Otherwise from huge page TLB. The user needs to reserve
// huge pages for it to be allocated, like:
// sysctl -w vm.nr_hugepages=20
// See linux doc Documentation/vm/hugetlbpage.txt
const uint32_t kPlainTableVariableLength = 0; const uint32_t kPlainTableVariableLength = 0;
extern TableFactory* NewPlainTableFactory(uint32_t user_key_len = extern TableFactory* NewPlainTableFactory(uint32_t user_key_len =
kPlainTableVariableLength, kPlainTableVariableLength,
int bloom_bits_per_prefix = 10, int bloom_bits_per_prefix = 10,
double hash_table_ratio = 0.75, double hash_table_ratio = 0.75,
size_t index_sparseness = 16); size_t index_sparseness = 16,
size_t huge_page_tlb_size = 0);
// -- Plain Table // -- Plain Table
// This factory of plain table ignores Options.prefix_extractor and assumes no // This factory of plain table ignores Options.prefix_extractor and assumes no
@ -115,9 +122,15 @@ extern TableFactory* NewPlainTableFactory(uint32_t user_key_len =
// disable it by passing a zero. // disable it by passing a zero.
// @index_sparseness: need to build one index record for how many keys for // @index_sparseness: need to build one index record for how many keys for
// binary search. // binary search.
// @huge_page_tlb_size: if 0, allocate hash indexes and blooms from malloc.
// Otherwise from huge page TLB. The user needs to reserve
// huge pages for it to be allocated, like:
// sysctl -w vm.nr_hugepages=20
// See linux doc Documentation/vm/hugetlbpage.txt
extern TableFactory* NewTotalOrderPlainTableFactory( extern TableFactory* NewTotalOrderPlainTableFactory(
uint32_t user_key_len = kPlainTableVariableLength, uint32_t user_key_len = kPlainTableVariableLength,
int bloom_bits_per_key = 0, size_t index_sparseness = 16); int bloom_bits_per_key = 0, size_t index_sparseness = 16,
size_t huge_page_tlb_size = 0);
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE

@ -22,7 +22,8 @@ Status PlainTableFactory::NewTableReader(const Options& options,
unique_ptr<TableReader>* table) const { unique_ptr<TableReader>* table) const {
return PlainTableReader::Open(options, soptions, icomp, std::move(file), return PlainTableReader::Open(options, soptions, icomp, std::move(file),
file_size, table, bloom_bits_per_key_, file_size, table, bloom_bits_per_key_,
hash_table_ratio_, index_sparseness_); hash_table_ratio_, index_sparseness_,
huge_page_tlb_size_);
} }
TableBuilder* PlainTableFactory::NewTableBuilder( TableBuilder* PlainTableFactory::NewTableBuilder(
@ -34,16 +35,19 @@ TableBuilder* PlainTableFactory::NewTableBuilder(
extern TableFactory* NewPlainTableFactory(uint32_t user_key_len, extern TableFactory* NewPlainTableFactory(uint32_t user_key_len,
int bloom_bits_per_key, int bloom_bits_per_key,
double hash_table_ratio, double hash_table_ratio,
size_t index_sparseness) { size_t index_sparseness,
size_t huge_page_tlb_size) {
return new PlainTableFactory(user_key_len, bloom_bits_per_key, return new PlainTableFactory(user_key_len, bloom_bits_per_key,
hash_table_ratio, index_sparseness); hash_table_ratio, index_sparseness,
huge_page_tlb_size);
} }
extern TableFactory* NewTotalOrderPlainTableFactory(uint32_t user_key_len, extern TableFactory* NewTotalOrderPlainTableFactory(uint32_t user_key_len,
int bloom_bits_per_key, int bloom_bits_per_key,
size_t index_sparseness) { size_t index_sparseness,
size_t huge_page_tlb_size) {
return new PlainTableFactory(user_key_len, bloom_bits_per_key, 0, return new PlainTableFactory(user_key_len, bloom_bits_per_key, 0,
index_sparseness); index_sparseness, huge_page_tlb_size);
} }
} // namespace rocksdb } // namespace rocksdb

@ -56,14 +56,19 @@ class PlainTableFactory : public TableFactory {
// inside the same prefix. It will be the maximum number of linear search // inside the same prefix. It will be the maximum number of linear search
// required after hash and binary search. // required after hash and binary search.
// index_sparseness = 0 means index for every key. // index_sparseness = 0 means index for every key.
// huge_page_tlb_size determines whether to allocate hash indexes from huge
// page TLB and the page size if allocating from there. See comments of
// Arena::AllocateAligned() for details.
explicit PlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength, explicit PlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength,
int bloom_bits_per_key = 0, int bloom_bits_per_key = 0,
double hash_table_ratio = 0.75, double hash_table_ratio = 0.75,
size_t index_sparseness = 16) size_t index_sparseness = 16,
size_t huge_page_tlb_size = 2 * 1024 * 1024)
: user_key_len_(user_key_len), : user_key_len_(user_key_len),
bloom_bits_per_key_(bloom_bits_per_key), bloom_bits_per_key_(bloom_bits_per_key),
hash_table_ratio_(hash_table_ratio), hash_table_ratio_(hash_table_ratio),
index_sparseness_(index_sparseness) {} index_sparseness_(index_sparseness),
huge_page_tlb_size_(huge_page_tlb_size) {}
const char* Name() const override { return "PlainTable"; } const char* Name() const override { return "PlainTable"; }
Status NewTableReader(const Options& options, const EnvOptions& soptions, Status NewTableReader(const Options& options, const EnvOptions& soptions,
const InternalKeyComparator& internal_comparator, const InternalKeyComparator& internal_comparator,
@ -82,6 +87,7 @@ class PlainTableFactory : public TableFactory {
int bloom_bits_per_key_; int bloom_bits_per_key_;
double hash_table_ratio_; double hash_table_ratio_;
size_t index_sparseness_; size_t index_sparseness_;
size_t huge_page_tlb_size_;
}; };
} // namespace rocksdb } // namespace rocksdb

@ -24,6 +24,7 @@
#include "table/two_level_iterator.h" #include "table/two_level_iterator.h"
#include "table/plain_table_factory.h" #include "table/plain_table_factory.h"
#include "util/arena.h"
#include "util/coding.h" #include "util/coding.h"
#include "util/dynamic_bloom.h" #include "util/dynamic_bloom.h"
#include "util/hash.h" #include "util/hash.h"
@ -95,7 +96,8 @@ PlainTableReader::PlainTableReader(
const Options& options, unique_ptr<RandomAccessFile>&& file, const Options& options, unique_ptr<RandomAccessFile>&& file,
const EnvOptions& storage_options, const InternalKeyComparator& icomparator, const EnvOptions& storage_options, const InternalKeyComparator& icomparator,
uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio, uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio,
size_t index_sparseness, const TableProperties* table_properties) size_t index_sparseness, const TableProperties* table_properties,
size_t huge_page_tlb_size)
: options_(options), : options_(options),
soptions_(storage_options), soptions_(storage_options),
file_(std::move(file)), file_(std::move(file)),
@ -106,19 +108,23 @@ PlainTableReader::PlainTableReader(
kIndexIntervalForSamePrefixKeys(index_sparseness), kIndexIntervalForSamePrefixKeys(index_sparseness),
table_properties_(nullptr), table_properties_(nullptr),
data_end_offset_(table_properties->data_size), data_end_offset_(table_properties->data_size),
user_key_len_(table_properties->fixed_key_len) { user_key_len_(table_properties->fixed_key_len),
huge_page_tlb_size_(huge_page_tlb_size) {
assert(kHashTableRatio >= 0.0); assert(kHashTableRatio >= 0.0);
} }
PlainTableReader::~PlainTableReader() { PlainTableReader::~PlainTableReader() {
} }
Status PlainTableReader::Open( Status PlainTableReader::Open(const Options& options,
const Options& options, const EnvOptions& soptions, const EnvOptions& soptions,
const InternalKeyComparator& internal_comparator, const InternalKeyComparator& internal_comparator,
unique_ptr<RandomAccessFile>&& file, uint64_t file_size, unique_ptr<RandomAccessFile>&& file,
unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key, uint64_t file_size,
double hash_table_ratio, size_t index_sparseness) { unique_ptr<TableReader>* table_reader,
const int bloom_bits_per_key,
double hash_table_ratio, size_t index_sparseness,
size_t huge_page_tlb_size) {
assert(options.allow_mmap_reads); assert(options.allow_mmap_reads);
if (file_size > kMaxFileSize) { if (file_size > kMaxFileSize) {
@ -134,7 +140,8 @@ Status PlainTableReader::Open(
std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader( std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
options, std::move(file), soptions, internal_comparator, file_size, options, std::move(file), soptions, internal_comparator, file_size,
bloom_bits_per_key, hash_table_ratio, index_sparseness, props)); bloom_bits_per_key, hash_table_ratio, index_sparseness, props,
huge_page_tlb_size));
// -- Populate Index // -- Populate Index
s = new_reader->PopulateIndex(props); s = new_reader->PopulateIndex(props);
@ -261,12 +268,11 @@ Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list,
} }
void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) { void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) {
index_.reset();
if (options_.prefix_extractor.get() != nullptr) { if (options_.prefix_extractor.get() != nullptr) {
uint32_t bloom_total_bits = num_prefixes * kBloomBitsPerKey; uint32_t bloom_total_bits = num_prefixes * kBloomBitsPerKey;
if (bloom_total_bits > 0) { if (bloom_total_bits > 0) {
bloom_.reset(new DynamicBloom(bloom_total_bits, options_.bloom_locality)); bloom_.reset(new DynamicBloom(bloom_total_bits, options_.bloom_locality,
6, nullptr, huge_page_tlb_size_));
} }
} }
@ -278,7 +284,6 @@ void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) {
double hash_table_size_multipier = 1.0 / kHashTableRatio; double hash_table_size_multipier = 1.0 / kHashTableRatio;
index_size_ = num_prefixes * hash_table_size_multipier + 1; index_size_ = num_prefixes * hash_table_size_multipier + 1;
} }
index_.reset(new uint32_t[index_size_]);
} }
size_t PlainTableReader::BucketizeIndexesAndFillBloom( size_t PlainTableReader::BucketizeIndexesAndFillBloom(
@ -322,7 +327,12 @@ void PlainTableReader::FillIndexes(
const std::vector<uint32_t>& entries_per_bucket) { const std::vector<uint32_t>& entries_per_bucket) {
Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index", Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index",
kSubIndexSize); kSubIndexSize);
sub_index_.reset(new char[kSubIndexSize]); auto total_allocate_size = sizeof(uint32_t) * index_size_ + kSubIndexSize;
char* allocated =
arena_.AllocateAligned(total_allocate_size, huge_page_tlb_size_);
index_ = reinterpret_cast<uint32_t*>(allocated);
sub_index_ = allocated + sizeof(uint32_t) * index_size_;
size_t sub_index_offset = 0; size_t sub_index_offset = 0;
for (int i = 0; i < index_size_; i++) { for (int i = 0; i < index_size_; i++) {
uint32_t num_keys_for_bucket = entries_per_bucket[i]; uint32_t num_keys_for_bucket = entries_per_bucket[i];
@ -387,7 +397,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props) {
if (IsTotalOrderMode()) { if (IsTotalOrderMode()) {
uint32_t num_bloom_bits = table_properties_->num_entries * kBloomBitsPerKey; uint32_t num_bloom_bits = table_properties_->num_entries * kBloomBitsPerKey;
if (num_bloom_bits > 0) { if (num_bloom_bits > 0) {
bloom_.reset(new DynamicBloom(num_bloom_bits, options_.bloom_locality)); bloom_.reset(new DynamicBloom(num_bloom_bits, options_.bloom_locality, 6,
nullptr, huge_page_tlb_size_));
} }
} }

@ -19,6 +19,7 @@
#include "rocksdb/table_properties.h" #include "rocksdb/table_properties.h"
#include "table/table_reader.h" #include "table/table_reader.h"
#include "table/plain_table_factory.h" #include "table/plain_table_factory.h"
#include "util/arena.h"
namespace rocksdb { namespace rocksdb {
@ -52,7 +53,7 @@ class PlainTableReader: public TableReader {
unique_ptr<RandomAccessFile>&& file, uint64_t file_size, unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
unique_ptr<TableReader>* table, unique_ptr<TableReader>* table,
const int bloom_bits_per_key, double hash_table_ratio, const int bloom_bits_per_key, double hash_table_ratio,
size_t index_sparseness); size_t index_sparseness, size_t huge_page_tlb_size);
Iterator* NewIterator(const ReadOptions&); Iterator* NewIterator(const ReadOptions&);
@ -74,7 +75,8 @@ class PlainTableReader: public TableReader {
const InternalKeyComparator& internal_comparator, const InternalKeyComparator& internal_comparator,
uint64_t file_size, int bloom_num_bits, uint64_t file_size, int bloom_num_bits,
double hash_table_ratio, size_t index_sparseness, double hash_table_ratio, size_t index_sparseness,
const TableProperties* table_properties); const TableProperties* table_properties,
size_t huge_page_tlb_size);
virtual ~PlainTableReader(); virtual ~PlainTableReader();
protected: protected:
@ -136,9 +138,9 @@ class PlainTableReader: public TableReader {
// For more details about the in-memory index, please refer to: // For more details about the in-memory index, please refer to:
// https://github.com/facebook/rocksdb/wiki/PlainTable-Format // https://github.com/facebook/rocksdb/wiki/PlainTable-Format
// #wiki-in-memory-index-format // #wiki-in-memory-index-format
std::unique_ptr<uint32_t[]> index_; uint32_t* index_;
int index_size_ = 0; int index_size_ = 0;
std::unique_ptr<char[]> sub_index_; char* sub_index_;
Options options_; Options options_;
const EnvOptions& soptions_; const EnvOptions& soptions_;
@ -159,6 +161,7 @@ class PlainTableReader: public TableReader {
const size_t kIndexIntervalForSamePrefixKeys = 16; const size_t kIndexIntervalForSamePrefixKeys = 16;
// Bloom filter is used to rule out non-existent key // Bloom filter is used to rule out non-existent key
unique_ptr<DynamicBloom> bloom_; unique_ptr<DynamicBloom> bloom_;
Arena arena_;
std::shared_ptr<const TableProperties> table_properties_; std::shared_ptr<const TableProperties> table_properties_;
// data_start_offset_ and data_end_offset_ defines the range of the // data_start_offset_ and data_end_offset_ defines the range of the
@ -166,6 +169,7 @@ class PlainTableReader: public TableReader {
const uint32_t data_start_offset_ = 0; const uint32_t data_start_offset_ = 0;
const uint32_t data_end_offset_; const uint32_t data_end_offset_;
const size_t user_key_len_; const size_t user_key_len_;
const size_t huge_page_tlb_size_;
static const size_t kNumInternalBytes = 8; static const size_t kNumInternalBytes = 8;
static const uint32_t kSubIndexMask = 0x80000000; static const uint32_t kSubIndexMask = 0x80000000;

@ -8,6 +8,7 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "util/arena.h" #include "util/arena.h"
#include <sys/mman.h>
#include <algorithm> #include <algorithm>
namespace rocksdb { namespace rocksdb {
@ -38,6 +39,13 @@ Arena::~Arena() {
for (const auto& block : blocks_) { for (const auto& block : blocks_) {
delete[] block; delete[] block;
} }
for (const auto& mmap_info : huge_blocks_) {
auto ret = munmap(mmap_info.addr_, mmap_info.length_);
if (ret != 0) {
// TODO(sdong): Better handling
perror("munmap");
}
}
} }
char* Arena::AllocateFallback(size_t bytes, bool aligned) { char* Arena::AllocateFallback(size_t bytes, bool aligned) {
@ -63,9 +71,29 @@ char* Arena::AllocateFallback(size_t bytes, bool aligned) {
} }
} }
char* Arena::AllocateAligned(size_t bytes) { char* Arena::AllocateAligned(size_t bytes, size_t huge_page_tlb_size) {
assert((kAlignUnit & (kAlignUnit - 1)) == assert((kAlignUnit & (kAlignUnit - 1)) ==
0); // Pointer size should be a power of 2 0); // Pointer size should be a power of 2
#ifdef OS_LINUX
if (huge_page_tlb_size > 0 && bytes > 0) {
// Allocate from a huge page TLB table.
size_t reserved_size =
((bytes - 1U) / huge_page_tlb_size + 1U) * huge_page_tlb_size;
assert(reserved_size >= bytes);
void* addr = mmap(nullptr, reserved_size, (PROT_READ | PROT_WRITE),
(MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), 0, 0);
if (addr == MAP_FAILED) {
perror("mmap");
// fall back to malloc
} else {
blocks_memory_ += reserved_size;
huge_blocks_.push_back(MmapInfo(addr, reserved_size));
return reinterpret_cast<char*>(addr);
}
}
#endif
size_t current_mod = size_t current_mod =
reinterpret_cast<uintptr_t>(aligned_alloc_ptr_) & (kAlignUnit - 1); reinterpret_cast<uintptr_t>(aligned_alloc_ptr_) & (kAlignUnit - 1);
size_t slop = (current_mod == 0 ? 0 : kAlignUnit - current_mod); size_t slop = (current_mod == 0 ? 0 : kAlignUnit - current_mod);

@ -34,7 +34,14 @@ class Arena {
char* Allocate(size_t bytes); char* Allocate(size_t bytes);
char* AllocateAligned(size_t bytes); // huge_page_tlb_size: if >0, allocate bytes from huge page TLB and the size
// of the huge page TLB. Bytes will be rounded up to a multiple of that page
// size and allocated from huge pages through an anonymous mmap with MAP_HUGETLB.
// The extra space allocated will be wasted. To enable it, need to reserve
// huge pages for it to be allocated, like:
// sysctl -w vm.nr_hugepages=20
// See linux doc Documentation/vm/hugetlbpage.txt for details.
char* AllocateAligned(size_t bytes, size_t huge_page_tlb_size = 0);
// Returns an estimate of the total memory usage of data allocated // Returns an estimate of the total memory usage of data allocated
// by the arena (exclude the space allocated but not yet used for future // by the arena (exclude the space allocated but not yet used for future
@ -60,6 +67,14 @@ class Arena {
// Array of new[] allocated memory blocks // Array of new[] allocated memory blocks
typedef std::vector<char*> Blocks; typedef std::vector<char*> Blocks;
Blocks blocks_; Blocks blocks_;
struct MmapInfo {
void* addr_;
size_t length_;
MmapInfo(void* addr, size_t length) : addr_(addr), length_(length) {}
};
std::vector<MmapInfo> huge_blocks_;
size_t irregular_block_num = 0; size_t irregular_block_num = 0;
// Stats for current active block. // Stats for current active block.

@ -19,18 +19,19 @@ static uint32_t BloomHash(const Slice& key) {
} }
} }
DynamicBloom::DynamicBloom(uint32_t total_bits, DynamicBloom::DynamicBloom(uint32_t total_bits, uint32_t cl_per_block,
uint32_t cl_per_block,
uint32_t num_probes, uint32_t num_probes,
uint32_t (*hash_func)(const Slice& key)) uint32_t (*hash_func)(const Slice& key),
: kBlocked(cl_per_block > 0), size_t huge_page_tlb_size)
kBitsPerBlock(std::min(cl_per_block, num_probes) * CACHE_LINE_SIZE * 8), : kBlocked(cl_per_block > 0),
kTotalBits((kBlocked ? (total_bits + kBitsPerBlock - 1) / kBitsPerBlock kBitsPerBlock(std::min(cl_per_block, num_probes) * CACHE_LINE_SIZE * 8),
* kBitsPerBlock : kTotalBits((kBlocked ? (total_bits + kBitsPerBlock - 1) / kBitsPerBlock *
total_bits + 7) / 8 * 8), kBitsPerBlock
kNumBlocks(kBlocked ? kTotalBits / kBitsPerBlock : 1), : total_bits + 7) /
kNumProbes(num_probes), 8 * 8),
hash_func_(hash_func == nullptr ? &BloomHash : hash_func) { kNumBlocks(kBlocked ? kTotalBits / kBitsPerBlock : 1),
kNumProbes(num_probes),
hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {
assert(kBlocked ? kTotalBits > 0 : kTotalBits >= kBitsPerBlock); assert(kBlocked ? kTotalBits > 0 : kTotalBits >= kBitsPerBlock);
assert(kNumProbes > 0); assert(kNumProbes > 0);
@ -38,7 +39,9 @@ DynamicBloom::DynamicBloom(uint32_t total_bits,
if (kBlocked) { if (kBlocked) {
sz += CACHE_LINE_SIZE - 1; sz += CACHE_LINE_SIZE - 1;
} }
raw_ = new unsigned char[sz](); raw_ = reinterpret_cast<unsigned char*>(
arena_.AllocateAligned(sz, huge_page_tlb_size));
memset(raw_, 0, sz);
if (kBlocked && (reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE)) { if (kBlocked && (reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE)) {
data_ = raw_ + CACHE_LINE_SIZE - data_ = raw_ + CACHE_LINE_SIZE -
reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE; reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE;

@ -8,6 +8,8 @@
#include <atomic> #include <atomic>
#include <memory> #include <memory>
#include <util/arena.h>
namespace rocksdb { namespace rocksdb {
class Slice; class Slice;
@ -19,13 +21,17 @@ class DynamicBloom {
// cl_per_block: block size in cache lines. When this is non-zero, a // cl_per_block: block size in cache lines. When this is non-zero, a
// query/set is done within a block to improve cache locality. // query/set is done within a block to improve cache locality.
// hash_func: customized hash function // hash_func: customized hash function
// huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB
// with this page size. Need to reserve huge pages for
// it to be allocated, like:
// sysctl -w vm.nr_hugepages=20
// See linux doc Documentation/vm/hugetlbpage.txt
explicit DynamicBloom(uint32_t total_bits, uint32_t cl_per_block = 0, explicit DynamicBloom(uint32_t total_bits, uint32_t cl_per_block = 0,
uint32_t num_probes = 6, uint32_t num_probes = 6,
uint32_t (*hash_func)(const Slice& key) = nullptr); uint32_t (*hash_func)(const Slice& key) = nullptr,
size_t huge_page_tlb_size = 0);
~DynamicBloom() { ~DynamicBloom() {}
delete[] raw_;
}
// Assuming single threaded access to this function. // Assuming single threaded access to this function.
void Add(const Slice& key); void Add(const Slice& key);
@ -49,6 +55,8 @@ class DynamicBloom {
uint32_t (*hash_func_)(const Slice& key); uint32_t (*hash_func_)(const Slice& key);
unsigned char* data_; unsigned char* data_;
unsigned char* raw_; unsigned char* raw_;
Arena arena_;
}; };
inline void DynamicBloom::Add(const Slice& key) { AddHash(hash_func_(key)); } inline void DynamicBloom::Add(const Slice& key) { AddHash(hash_func_(key)); }

@ -53,7 +53,8 @@ struct Node {
class HashLinkListRep : public MemTableRep { class HashLinkListRep : public MemTableRep {
public: public:
HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena, HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena,
const SliceTransform* transform, size_t bucket_size); const SliceTransform* transform, size_t bucket_size,
size_t huge_page_tlb_size);
virtual KeyHandle Allocate(const size_t len, char** buf) override; virtual KeyHandle Allocate(const size_t len, char** buf) override;
@ -306,13 +307,13 @@ class HashLinkListRep : public MemTableRep {
HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare, HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare,
Arena* arena, const SliceTransform* transform, Arena* arena, const SliceTransform* transform,
size_t bucket_size) size_t bucket_size, size_t huge_page_tlb_size)
: MemTableRep(arena), : MemTableRep(arena),
bucket_size_(bucket_size), bucket_size_(bucket_size),
transform_(transform), transform_(transform),
compare_(compare) { compare_(compare) {
char* mem = arena_->AllocateAligned( char* mem = arena_->AllocateAligned(sizeof(port::AtomicPointer) * bucket_size,
sizeof(port::AtomicPointer) * bucket_size); huge_page_tlb_size);
buckets_ = new (mem) port::AtomicPointer[bucket_size]; buckets_ = new (mem) port::AtomicPointer[bucket_size];
@ -469,11 +470,13 @@ Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head,
MemTableRep* HashLinkListRepFactory::CreateMemTableRep( MemTableRep* HashLinkListRepFactory::CreateMemTableRep(
const MemTableRep::KeyComparator& compare, Arena* arena, const MemTableRep::KeyComparator& compare, Arena* arena,
const SliceTransform* transform) { const SliceTransform* transform) {
return new HashLinkListRep(compare, arena, transform, bucket_count_); return new HashLinkListRep(compare, arena, transform, bucket_count_,
huge_page_tlb_size_);
} }
MemTableRepFactory* NewHashLinkListRepFactory(size_t bucket_count) { MemTableRepFactory* NewHashLinkListRepFactory(size_t bucket_count,
return new HashLinkListRepFactory(bucket_count); size_t huge_page_tlb_size) {
return new HashLinkListRepFactory(bucket_count, huge_page_tlb_size);
} }
} // namespace rocksdb } // namespace rocksdb

@ -15,8 +15,9 @@ namespace rocksdb {
class HashLinkListRepFactory : public MemTableRepFactory { class HashLinkListRepFactory : public MemTableRepFactory {
public: public:
explicit HashLinkListRepFactory(size_t bucket_count) explicit HashLinkListRepFactory(size_t bucket_count,
: bucket_count_(bucket_count) { } size_t huge_page_tlb_size)
: bucket_count_(bucket_count), huge_page_tlb_size_(huge_page_tlb_size) {}
virtual ~HashLinkListRepFactory() {} virtual ~HashLinkListRepFactory() {}
@ -30,6 +31,7 @@ class HashLinkListRepFactory : public MemTableRepFactory {
private: private:
const size_t bucket_count_; const size_t bucket_count_;
const size_t huge_page_tlb_size_;
}; };
} }

@ -32,8 +32,7 @@ ColumnFamilyOptions::ColumnFamilyOptions()
compaction_filter(nullptr), compaction_filter(nullptr),
compaction_filter_factory(std::shared_ptr<CompactionFilterFactory>( compaction_filter_factory(std::shared_ptr<CompactionFilterFactory>(
new DefaultCompactionFilterFactory())), new DefaultCompactionFilterFactory())),
compaction_filter_factory_v2( compaction_filter_factory_v2(new DefaultCompactionFilterFactoryV2()),
new DefaultCompactionFilterFactoryV2()),
write_buffer_size(4 << 20), write_buffer_size(4 << 20),
max_write_buffer_number(2), max_write_buffer_number(2),
min_write_buffer_number_to_merge(1), min_write_buffer_number_to_merge(1),
@ -79,6 +78,7 @@ ColumnFamilyOptions::ColumnFamilyOptions()
inplace_callback(nullptr), inplace_callback(nullptr),
memtable_prefix_bloom_bits(0), memtable_prefix_bloom_bits(0),
memtable_prefix_bloom_probes(6), memtable_prefix_bloom_probes(6),
memtable_prefix_bloom_huge_page_tlb_size(0),
bloom_locality(0), bloom_locality(0),
max_successive_merges(0), max_successive_merges(0),
min_partial_merge_operands(2) { min_partial_merge_operands(2) {
@ -144,6 +144,8 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
inplace_callback(options.inplace_callback), inplace_callback(options.inplace_callback),
memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits), memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits),
memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes), memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes),
memtable_prefix_bloom_huge_page_tlb_size(
options.memtable_prefix_bloom_huge_page_tlb_size),
bloom_locality(options.bloom_locality), bloom_locality(options.bloom_locality),
max_successive_merges(options.max_successive_merges), max_successive_merges(options.max_successive_merges),
min_partial_merge_operands(options.min_partial_merge_operands) { min_partial_merge_operands(options.min_partial_merge_operands) {
@ -423,6 +425,8 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
memtable_prefix_bloom_bits); memtable_prefix_bloom_bits);
Log(log, " Options.memtable_prefix_bloom_probes: %d", Log(log, " Options.memtable_prefix_bloom_probes: %d",
memtable_prefix_bloom_probes); memtable_prefix_bloom_probes);
Log(log, " Options.memtable_prefix_bloom_huge_page_tlb_size: %zu",
memtable_prefix_bloom_huge_page_tlb_size);
Log(log, " Options.bloom_locality: %d", Log(log, " Options.bloom_locality: %d",
bloom_locality); bloom_locality);
Log(log, " Options.max_successive_merges: %zd", Log(log, " Options.max_successive_merges: %zd",

Loading…
Cancel
Save