Revert "Allow allocating dynamic bloom, plain table indexes and hash linked list from huge page TLB"

This reverts commit 7dafa3a1d7.
main
Igor Canadi 11 years ago
parent 41e5cf2392
commit d69dc64be7
  1. 2
      db/db_test.cc
  2. 7
      db/memtable.cc
  3. 631
      db/plain_table_db_test.cc
  4. 8
      db/prefix_test.cc
  5. 9
      include/rocksdb/memtablerep.h
  6. 8
      include/rocksdb/options.h
  7. 17
      include/rocksdb/table.h
  8. 14
      table/plain_table_factory.cc
  9. 10
      table/plain_table_factory.h
  10. 41
      table/plain_table_reader.cc
  11. 12
      table/plain_table_reader.h
  12. 30
      util/arena.cc
  13. 17
      util/arena.h
  14. 27
      util/dynamic_bloom.cc
  15. 18
      util/dynamic_bloom.h
  16. 25
      util/hash_linklist_rep.cc
  17. 6
      util/hash_linklist_rep.h
  18. 8
      util/options.cc

@ -481,7 +481,7 @@ class DBTest {
break;
case kHashLinkList:
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0));
options.memtable_factory.reset(NewHashLinkListRepFactory(4));
break;
case kHashCuckoo:
options.memtable_factory.reset(

@ -52,10 +52,9 @@ MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
// gone wrong already.
assert(!should_flush_);
if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) {
prefix_bloom_.reset(new DynamicBloom(
options.memtable_prefix_bloom_bits, options.bloom_locality,
options.memtable_prefix_bloom_probes, nullptr,
options.memtable_prefix_bloom_huge_page_tlb_size));
prefix_bloom_.reset(new DynamicBloom(options.memtable_prefix_bloom_bits,
options.bloom_locality,
options.memtable_prefix_bloom_probes));
}
}

@ -185,7 +185,7 @@ class TestPlainTableReader : public PlainTableReader {
const Options& options, bool* expect_bloom_not_match)
: PlainTableReader(options, std::move(file), storage_options, icomparator,
file_size, bloom_bits_per_key, hash_table_ratio,
index_sparseness, table_properties, 2 * 1024 * 1024),
index_sparseness, table_properties),
expect_bloom_not_match_(expect_bloom_not_match) {
Status s = PopulateIndex(const_cast<TableProperties*>(table_properties));
ASSERT_TRUE(s.ok());
@ -206,12 +206,13 @@ extern const uint64_t kPlainTableMagicNumber;
class TestPlainTableFactory : public PlainTableFactory {
public:
explicit TestPlainTableFactory(bool* expect_bloom_not_match,
uint32_t user_key_len, int bloom_bits_per_key,
double hash_table_ratio,
size_t index_sparseness,
size_t huge_page_tlb_size)
uint32_t user_key_len =
kPlainTableVariableLength,
int bloom_bits_per_key = 0,
double hash_table_ratio = 0.75,
size_t index_sparseness = 16)
: PlainTableFactory(user_key_len, user_key_len, hash_table_ratio,
index_sparseness, huge_page_tlb_size),
hash_table_ratio),
bloom_bits_per_key_(bloom_bits_per_key),
hash_table_ratio_(hash_table_ratio),
index_sparseness_(index_sparseness),
@ -243,209 +244,197 @@ class TestPlainTableFactory : public PlainTableFactory {
};
TEST(PlainTableDBTest, Flush) {
for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
huge_page_tlb_size += 2 * 1024 * 1024) {
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
for (int total_order = 0; total_order <= 1; total_order++) {
Options options = CurrentOptions();
options.create_if_missing = true;
// Set only one bucket to force bucket conflict.
// Test index interval for the same prefix to be 1, 2 and 4
if (total_order) {
options.table_factory.reset(NewTotalOrderPlainTableFactory(
16, bloom_bits, 2, huge_page_tlb_size));
} else {
options.table_factory.reset(NewPlainTableFactory(
16, bloom_bits, 0.75, 16, huge_page_tlb_size));
}
DestroyAndReopen(&options);
ASSERT_OK(Put("1000000000000foo", "v1"));
ASSERT_OK(Put("0000000000000bar", "v2"));
ASSERT_OK(Put("1000000000000foo", "v3"));
dbfull()->TEST_FlushMemTable();
TablePropertiesCollection ptc;
reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
ASSERT_EQ(1U, ptc.size());
auto row = ptc.begin();
auto tp = row->second;
ASSERT_EQ(total_order ? "4" : "12", (tp->user_collected_properties).at(
"plain_table_hash_table_size"));
ASSERT_EQ(total_order ? "9" : "0", (tp->user_collected_properties).at(
"plain_table_sub_index_size"));
ASSERT_EQ("v3", Get("1000000000000foo"));
ASSERT_EQ("v2", Get("0000000000000bar"));
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
for (int total_order = 0; total_order <= 1; total_order++) {
Options options = CurrentOptions();
options.create_if_missing = true;
// Set only one bucket to force bucket conflict.
// Test index interval for the same prefix to be 1, 2 and 4
if (total_order) {
options.table_factory.reset(
NewTotalOrderPlainTableFactory(16, bloom_bits, 2));
} else {
options.table_factory.reset(NewPlainTableFactory(16, bloom_bits));
}
DestroyAndReopen(&options);
ASSERT_OK(Put("1000000000000foo", "v1"));
ASSERT_OK(Put("0000000000000bar", "v2"));
ASSERT_OK(Put("1000000000000foo", "v3"));
dbfull()->TEST_FlushMemTable();
TablePropertiesCollection ptc;
reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
ASSERT_EQ(1U, ptc.size());
auto row = ptc.begin();
auto tp = row->second;
ASSERT_EQ(
total_order ? "4" : "12",
(tp->user_collected_properties).at("plain_table_hash_table_size"));
ASSERT_EQ(
total_order ? "9" : "0",
(tp->user_collected_properties).at("plain_table_sub_index_size"));
ASSERT_EQ("v3", Get("1000000000000foo"));
ASSERT_EQ("v2", Get("0000000000000bar"));
}
}
}
TEST(PlainTableDBTest, Flush2) {
for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
huge_page_tlb_size += 2 * 1024 * 1024) {
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
for (int total_order = 0; total_order <= 1; total_order++) {
bool expect_bloom_not_match = false;
Options options = CurrentOptions();
options.create_if_missing = true;
// Set only one bucket to force bucket conflict.
// Test index interval for the same prefix to be 1, 2 and 4
if (total_order) {
options.prefix_extractor = nullptr;
options.table_factory.reset(
new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
0, 2, huge_page_tlb_size));
} else {
options.table_factory.reset(
new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
0.75, 16, huge_page_tlb_size));
}
DestroyAndReopen(&options);
ASSERT_OK(Put("0000000000000bar", "b"));
ASSERT_OK(Put("1000000000000foo", "v1"));
dbfull()->TEST_FlushMemTable();
ASSERT_OK(Put("1000000000000foo", "v2"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v2", Get("1000000000000foo"));
ASSERT_OK(Put("0000000000000eee", "v3"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v3", Get("0000000000000eee"));
ASSERT_OK(Delete("0000000000000bar"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
ASSERT_OK(Put("0000000000000eee", "v5"));
ASSERT_OK(Put("9000000000000eee", "v5"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v5", Get("0000000000000eee"));
// Test Bloom Filter
if (bloom_bits > 0) {
// Neither key nor value should exist.
expect_bloom_not_match = true;
ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
for (int total_order = 0; total_order <= 1; total_order++) {
bool expect_bloom_not_match = false;
Options options = CurrentOptions();
options.create_if_missing = true;
// Set only one bucket to force bucket conflict.
// Test index interval for the same prefix to be 1, 2 and 4
if (total_order) {
options.prefix_extractor = nullptr;
options.table_factory.reset(new TestPlainTableFactory(
&expect_bloom_not_match, 16, bloom_bits, 0, 2));
} else {
options.table_factory.reset(
new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits));
}
DestroyAndReopen(&options);
ASSERT_OK(Put("0000000000000bar", "b"));
ASSERT_OK(Put("1000000000000foo", "v1"));
dbfull()->TEST_FlushMemTable();
// Key doesn't exist any more but prefix exists.
if (total_order) {
ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
ASSERT_EQ("NOT_FOUND", Get("0000000000000not"));
}
expect_bloom_not_match = false;
ASSERT_OK(Put("1000000000000foo", "v2"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v2", Get("1000000000000foo"));
ASSERT_OK(Put("0000000000000eee", "v3"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v3", Get("0000000000000eee"));
ASSERT_OK(Delete("0000000000000bar"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
ASSERT_OK(Put("0000000000000eee", "v5"));
ASSERT_OK(Put("9000000000000eee", "v5"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v5", Get("0000000000000eee"));
// Test Bloom Filter
if (bloom_bits > 0) {
// Neither key nor value should exist.
expect_bloom_not_match = true;
ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
// Key doesn't exist any more but prefix exists.
if (total_order) {
ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
ASSERT_EQ("NOT_FOUND", Get("0000000000000not"));
}
expect_bloom_not_match = false;
}
}
}
}
TEST(PlainTableDBTest, Iterator) {
for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
huge_page_tlb_size += 2 * 1024 * 1024) {
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
for (int total_order = 0; total_order <= 1; total_order++) {
bool expect_bloom_not_match = false;
Options options = CurrentOptions();
options.create_if_missing = true;
// Set only one bucket to force bucket conflict.
// Test index interval for the same prefix to be 1, 2 and 4
if (total_order) {
options.prefix_extractor = nullptr;
options.table_factory.reset(
new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
0, 2, huge_page_tlb_size));
} else {
options.table_factory.reset(
new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
0.75, 16, huge_page_tlb_size));
}
DestroyAndReopen(&options);
ASSERT_OK(Put("1000000000foo002", "v_2"));
ASSERT_OK(Put("0000000000000bar", "random"));
ASSERT_OK(Put("1000000000foo001", "v1"));
ASSERT_OK(Put("3000000000000bar", "bar_v"));
ASSERT_OK(Put("1000000000foo003", "v__3"));
ASSERT_OK(Put("1000000000foo004", "v__4"));
ASSERT_OK(Put("1000000000foo005", "v__5"));
ASSERT_OK(Put("1000000000foo007", "v__7"));
ASSERT_OK(Put("1000000000foo008", "v__8"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v1", Get("1000000000foo001"));
ASSERT_EQ("v__3", Get("1000000000foo003"));
Iterator* iter = dbfull()->NewIterator(ReadOptions());
iter->Seek("1000000000foo000");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo001", iter->key().ToString());
ASSERT_EQ("v1", iter->value().ToString());
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
for (int total_order = 0; total_order <= 1; total_order++) {
bool expect_bloom_not_match = false;
Options options = CurrentOptions();
options.create_if_missing = true;
// Set only one bucket to force bucket conflict.
// Test index interval for the same prefix to be 1, 2 and 4
if (total_order) {
options.prefix_extractor = nullptr;
options.table_factory.reset(new TestPlainTableFactory(
&expect_bloom_not_match, 16, bloom_bits, 0, 2));
} else {
options.table_factory.reset(
new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits));
}
DestroyAndReopen(&options);
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo002", iter->key().ToString());
ASSERT_EQ("v_2", iter->value().ToString());
ASSERT_OK(Put("1000000000foo002", "v_2"));
ASSERT_OK(Put("0000000000000bar", "random"));
ASSERT_OK(Put("1000000000foo001", "v1"));
ASSERT_OK(Put("3000000000000bar", "bar_v"));
ASSERT_OK(Put("1000000000foo003", "v__3"));
ASSERT_OK(Put("1000000000foo004", "v__4"));
ASSERT_OK(Put("1000000000foo005", "v__5"));
ASSERT_OK(Put("1000000000foo007", "v__7"));
ASSERT_OK(Put("1000000000foo008", "v__8"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v1", Get("1000000000foo001"));
ASSERT_EQ("v__3", Get("1000000000foo003"));
Iterator* iter = dbfull()->NewIterator(ReadOptions());
iter->Seek("1000000000foo000");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo001", iter->key().ToString());
ASSERT_EQ("v1", iter->value().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo003", iter->key().ToString());
ASSERT_EQ("v__3", iter->value().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo002", iter->key().ToString());
ASSERT_EQ("v_2", iter->value().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo004", iter->key().ToString());
ASSERT_EQ("v__4", iter->value().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo003", iter->key().ToString());
ASSERT_EQ("v__3", iter->value().ToString());
iter->Seek("3000000000000bar");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("3000000000000bar", iter->key().ToString());
ASSERT_EQ("bar_v", iter->value().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo004", iter->key().ToString());
ASSERT_EQ("v__4", iter->value().ToString());
iter->Seek("1000000000foo000");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo001", iter->key().ToString());
ASSERT_EQ("v1", iter->value().ToString());
iter->Seek("3000000000000bar");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("3000000000000bar", iter->key().ToString());
ASSERT_EQ("bar_v", iter->value().ToString());
iter->Seek("1000000000foo005");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo005", iter->key().ToString());
ASSERT_EQ("v__5", iter->value().ToString());
iter->Seek("1000000000foo000");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo001", iter->key().ToString());
ASSERT_EQ("v1", iter->value().ToString());
iter->Seek("1000000000foo006");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo007", iter->key().ToString());
ASSERT_EQ("v__7", iter->value().ToString());
iter->Seek("1000000000foo005");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo005", iter->key().ToString());
ASSERT_EQ("v__5", iter->value().ToString());
iter->Seek("1000000000foo008");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo008", iter->key().ToString());
ASSERT_EQ("v__8", iter->value().ToString());
iter->Seek("1000000000foo006");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo007", iter->key().ToString());
ASSERT_EQ("v__7", iter->value().ToString());
if (total_order == 0) {
iter->Seek("1000000000foo009");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("3000000000000bar", iter->key().ToString());
}
iter->Seek("1000000000foo008");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo008", iter->key().ToString());
ASSERT_EQ("v__8", iter->value().ToString());
// Test Bloom Filter
if (bloom_bits > 0) {
if (!total_order) {
// Neither key nor value should exist.
expect_bloom_not_match = true;
iter->Seek("2not000000000bar");
ASSERT_TRUE(!iter->Valid());
ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
expect_bloom_not_match = false;
} else {
expect_bloom_not_match = true;
ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
expect_bloom_not_match = false;
}
}
if (total_order == 0) {
iter->Seek("1000000000foo009");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("3000000000000bar", iter->key().ToString());
}
delete iter;
// Test Bloom Filter
if (bloom_bits > 0) {
if (!total_order) {
// Neither key nor value should exist.
expect_bloom_not_match = true;
iter->Seek("2not000000000bar");
ASSERT_TRUE(!iter->Valid());
ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
expect_bloom_not_match = false;
} else {
expect_bloom_not_match = true;
ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
expect_bloom_not_match = false;
}
}
delete iter;
}
}
}
@ -592,173 +581,165 @@ TEST(PlainTableDBTest, IteratorReverseSuffixComparator) {
}
TEST(PlainTableDBTest, HashBucketConflict) {
for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
huge_page_tlb_size += 2 * 1024 * 1024) {
for (unsigned char i = 1; i <= 3; i++) {
Options options = CurrentOptions();
options.create_if_missing = true;
// Set only one bucket to force bucket conflict.
// Test index interval for the same prefix to be 1, 2 and 4
options.table_factory.reset(
NewTotalOrderPlainTableFactory(16, 0, 2 ^ i, huge_page_tlb_size));
DestroyAndReopen(&options);
ASSERT_OK(Put("5000000000000fo0", "v1"));
ASSERT_OK(Put("5000000000000fo1", "v2"));
ASSERT_OK(Put("5000000000000fo2", "v"));
ASSERT_OK(Put("2000000000000fo0", "v3"));
ASSERT_OK(Put("2000000000000fo1", "v4"));
ASSERT_OK(Put("2000000000000fo2", "v"));
ASSERT_OK(Put("2000000000000fo3", "v"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v1", Get("5000000000000fo0"));
ASSERT_EQ("v2", Get("5000000000000fo1"));
ASSERT_EQ("v3", Get("2000000000000fo0"));
ASSERT_EQ("v4", Get("2000000000000fo1"));
ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
ReadOptions ro;
Iterator* iter = dbfull()->NewIterator(ro);
iter->Seek("5000000000000fo0");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo0", iter->key().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
for (unsigned char i = 1; i <= 3; i++) {
Options options = CurrentOptions();
options.create_if_missing = true;
// Set only one bucket to force bucket conflict.
// Test index interval for the same prefix to be 1, 2 and 4
options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 2 ^ i));
DestroyAndReopen(&options);
ASSERT_OK(Put("5000000000000fo0", "v1"));
ASSERT_OK(Put("5000000000000fo1", "v2"));
ASSERT_OK(Put("5000000000000fo2", "v"));
ASSERT_OK(Put("2000000000000fo0", "v3"));
ASSERT_OK(Put("2000000000000fo1", "v4"));
ASSERT_OK(Put("2000000000000fo2", "v"));
ASSERT_OK(Put("2000000000000fo3", "v"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v1", Get("5000000000000fo0"));
ASSERT_EQ("v2", Get("5000000000000fo1"));
ASSERT_EQ("v3", Get("2000000000000fo0"));
ASSERT_EQ("v4", Get("2000000000000fo1"));
ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
ReadOptions ro;
Iterator* iter = dbfull()->NewIterator(ro);
iter->Seek("5000000000000fo0");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo0", iter->key().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
iter->Seek("5000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
iter->Seek("5000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
iter->Seek("2000000000000fo0");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo0", iter->key().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
iter->Seek("2000000000000fo0");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo0", iter->key().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
iter->Seek("2000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
iter->Seek("2000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
iter->Seek("2000000000000bar");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo0", iter->key().ToString());
iter->Seek("2000000000000bar");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo0", iter->key().ToString());
iter->Seek("5000000000000bar");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo0", iter->key().ToString());
iter->Seek("5000000000000bar");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo0", iter->key().ToString());
iter->Seek("2000000000000fo8");
ASSERT_TRUE(!iter->Valid() ||
options.comparator->Compare(iter->key(), "20000001") > 0);
iter->Seek("2000000000000fo8");
ASSERT_TRUE(!iter->Valid() ||
options.comparator->Compare(iter->key(), "20000001") > 0);
iter->Seek("5000000000000fo8");
ASSERT_TRUE(!iter->Valid());
iter->Seek("5000000000000fo8");
ASSERT_TRUE(!iter->Valid());
iter->Seek("1000000000000fo2");
ASSERT_TRUE(!iter->Valid());
iter->Seek("1000000000000fo2");
ASSERT_TRUE(!iter->Valid());
iter->Seek("3000000000000fo2");
ASSERT_TRUE(!iter->Valid());
iter->Seek("3000000000000fo2");
ASSERT_TRUE(!iter->Valid());
iter->Seek("8000000000000fo2");
ASSERT_TRUE(!iter->Valid());
iter->Seek("8000000000000fo2");
ASSERT_TRUE(!iter->Valid());
delete iter;
}
delete iter;
}
}
TEST(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
huge_page_tlb_size += 2 * 1024 * 1024) {
for (unsigned char i = 1; i <= 3; i++) {
Options options = CurrentOptions();
options.create_if_missing = true;
SimpleSuffixReverseComparator comp;
options.comparator = &comp;
// Set only one bucket to force bucket conflict.
// Test index interval for the same prefix to be 1, 2 and 4
options.table_factory.reset(
NewTotalOrderPlainTableFactory(16, 0, 2 ^ i, huge_page_tlb_size));
DestroyAndReopen(&options);
ASSERT_OK(Put("5000000000000fo0", "v1"));
ASSERT_OK(Put("5000000000000fo1", "v2"));
ASSERT_OK(Put("5000000000000fo2", "v"));
ASSERT_OK(Put("2000000000000fo0", "v3"));
ASSERT_OK(Put("2000000000000fo1", "v4"));
ASSERT_OK(Put("2000000000000fo2", "v"));
ASSERT_OK(Put("2000000000000fo3", "v"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v1", Get("5000000000000fo0"));
ASSERT_EQ("v2", Get("5000000000000fo1"));
ASSERT_EQ("v3", Get("2000000000000fo0"));
ASSERT_EQ("v4", Get("2000000000000fo1"));
ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
ReadOptions ro;
Iterator* iter = dbfull()->NewIterator(ro);
iter->Seek("5000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo0", iter->key().ToString());
for (unsigned char i = 1; i <= 3; i++) {
Options options = CurrentOptions();
options.create_if_missing = true;
SimpleSuffixReverseComparator comp;
options.comparator = &comp;
// Set only one bucket to force bucket conflict.
// Test index interval for the same prefix to be 1, 2 and 4
options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 2 ^ i));
DestroyAndReopen(&options);
ASSERT_OK(Put("5000000000000fo0", "v1"));
ASSERT_OK(Put("5000000000000fo1", "v2"));
ASSERT_OK(Put("5000000000000fo2", "v"));
ASSERT_OK(Put("2000000000000fo0", "v3"));
ASSERT_OK(Put("2000000000000fo1", "v4"));
ASSERT_OK(Put("2000000000000fo2", "v"));
ASSERT_OK(Put("2000000000000fo3", "v"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v1", Get("5000000000000fo0"));
ASSERT_EQ("v2", Get("5000000000000fo1"));
ASSERT_EQ("v3", Get("2000000000000fo0"));
ASSERT_EQ("v4", Get("2000000000000fo1"));
ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
ReadOptions ro;
Iterator* iter = dbfull()->NewIterator(ro);
iter->Seek("5000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo0", iter->key().ToString());
iter->Seek("5000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
iter->Seek("5000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
iter->Seek("2000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo0", iter->key().ToString());
iter->Seek("2000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo0", iter->key().ToString());
iter->Seek("2000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
iter->Seek("2000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
iter->Seek("2000000000000var");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo3", iter->key().ToString());
iter->Seek("2000000000000var");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo3", iter->key().ToString());
iter->Seek("5000000000000var");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo2", iter->key().ToString());
iter->Seek("5000000000000var");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo2", iter->key().ToString());
std::string seek_key = "2000000000000bar";
iter->Seek(seek_key);
ASSERT_TRUE(!iter->Valid() ||
options.prefix_extractor->Transform(iter->key()) !=
options.prefix_extractor->Transform(seek_key));
std::string seek_key = "2000000000000bar";
iter->Seek(seek_key);
ASSERT_TRUE(!iter->Valid() ||
options.prefix_extractor->Transform(iter->key()) !=
options.prefix_extractor->Transform(seek_key));
iter->Seek("1000000000000fo2");
ASSERT_TRUE(!iter->Valid());
iter->Seek("1000000000000fo2");
ASSERT_TRUE(!iter->Valid());
iter->Seek("3000000000000fo2");
ASSERT_TRUE(!iter->Valid());
iter->Seek("3000000000000fo2");
ASSERT_TRUE(!iter->Valid());
iter->Seek("8000000000000fo2");
ASSERT_TRUE(!iter->Valid());
iter->Seek("8000000000000fo2");
ASSERT_TRUE(!iter->Valid());
delete iter;
}
delete iter;
}
}

@ -30,7 +30,6 @@ DEFINE_int64(min_write_buffer_number_to_merge, 1, "");
DEFINE_int32(skiplist_height, 4, "");
DEFINE_int32(memtable_prefix_bloom_bits, 10000000, "");
DEFINE_int32(memtable_prefix_bloom_probes, 10, "");
DEFINE_int32(memtable_prefix_bloom_huge_page_tlb_size, 2 * 1024 * 1024, "");
DEFINE_int32(value_size, 40, "");
// Path to the database on file system
@ -149,8 +148,6 @@ class PrefixTest {
options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits;
options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes;
options.memtable_prefix_bloom_huge_page_tlb_size =
FLAGS_memtable_prefix_bloom_huge_page_tlb_size;
Status s = DB::Open(options, kDbName, &db);
ASSERT_OK(s);
@ -175,10 +172,6 @@ class PrefixTest {
options.memtable_factory.reset(
NewHashLinkListRepFactory(bucket_count));
return true;
case kHashLinkListHugePageTlb:
options.memtable_factory.reset(
NewHashLinkListRepFactory(bucket_count, 2 * 1024 * 1024));
return true;
default:
return false;
}
@ -197,7 +190,6 @@ class PrefixTest {
kBegin,
kHashSkipList,
kHashLinkList,
kHashLinkListHugePageTlb,
kEnd
};
int option_config_;

@ -223,14 +223,9 @@ extern MemTableRepFactory* NewHashSkipListRepFactory(
// The factory is to create memtables with a hashed linked list:
// it contains a fixed array of buckets, each pointing to a sorted single
// linked list (null if the bucket is empty).
// @bucket_count: number of fixed array buckets
// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
// Otherwise from huge page TLB. The user needs to reserve
// huge pages for it to be allocated, like:
// sysctl -w vm.nr_hugepages=20
// See linux doc Documentation/vm/hugetlbpage.txt
// bucket_count: number of fixed array buckets
extern MemTableRepFactory* NewHashLinkListRepFactory(
size_t bucket_count = 50000, size_t huge_page_tlb_size = 2 * 1024 * 1024);
size_t bucket_count = 50000);
// This factory creates a cuckoo-hashing based mem-table representation.
// Cuckoo-hash is a closed-hash strategy, in which all key/value pairs

@ -498,14 +498,6 @@ struct ColumnFamilyOptions {
// number of hash probes per key
uint32_t memtable_prefix_bloom_probes;
// Page size for huge page TLB for bloom in memtable. If <=0, not allocate
// from huge page TLB but from malloc.
// Need to reserve huge pages for it to be allocated. For example:
// sysctl -w vm.nr_hugepages=20
// See linux doc Documentation/vm/hugetlbpage.txt
size_t memtable_prefix_bloom_huge_page_tlb_size;
// Control locality of bloom filter probes to improve cache miss rate.
// This option only applies to memtable prefix bloom and plaintable
// prefix bloom. It essentially limits the max number of cache lines each

@ -107,19 +107,12 @@ extern TableFactory* NewBlockBasedTableFactory(
// in the hash table
// @index_sparseness: inside each prefix, need to build one index record for how
// many keys for binary search inside each hash bucket.
// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
// Otherwise from huge page TLB. The user needs to reserve
// huge pages for it to be allocated, like:
// sysctl -w vm.nr_hugepages=20
// See linux doc Documentation/vm/hugetlbpage.txt
const uint32_t kPlainTableVariableLength = 0;
extern TableFactory* NewPlainTableFactory(uint32_t user_key_len =
kPlainTableVariableLength,
int bloom_bits_per_prefix = 10,
double hash_table_ratio = 0.75,
size_t index_sparseness = 16,
size_t huge_page_tlb_size = 0);
size_t index_sparseness = 16);
// -- Plain Table
// This factory of plain table ignores Options.prefix_extractor and assumes no
@ -133,15 +126,9 @@ extern TableFactory* NewPlainTableFactory(uint32_t user_key_len =
// disable it by passing a zero.
// @index_sparseness: need to build one index record for how many keys for
// binary search.
// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
// Otherwise from huge page TLB. The user needs to reserve
// huge pages for it to be allocated, like:
// sysctl -w vm.nr_hugepages=20
// See linux doc Documentation/vm/hugetlbpage.txt
extern TableFactory* NewTotalOrderPlainTableFactory(
uint32_t user_key_len = kPlainTableVariableLength,
int bloom_bits_per_key = 0, size_t index_sparseness = 16,
size_t huge_page_tlb_size = 0);
int bloom_bits_per_key = 0, size_t index_sparseness = 16);
#endif // ROCKSDB_LITE

@ -22,8 +22,7 @@ Status PlainTableFactory::NewTableReader(const Options& options,
unique_ptr<TableReader>* table) const {
return PlainTableReader::Open(options, soptions, icomp, std::move(file),
file_size, table, bloom_bits_per_key_,
hash_table_ratio_, index_sparseness_,
huge_page_tlb_size_);
hash_table_ratio_, index_sparseness_);
}
TableBuilder* PlainTableFactory::NewTableBuilder(
@ -35,19 +34,16 @@ TableBuilder* PlainTableFactory::NewTableBuilder(
extern TableFactory* NewPlainTableFactory(uint32_t user_key_len,
int bloom_bits_per_key,
double hash_table_ratio,
size_t index_sparseness,
size_t huge_page_tlb_size) {
size_t index_sparseness) {
return new PlainTableFactory(user_key_len, bloom_bits_per_key,
hash_table_ratio, index_sparseness,
huge_page_tlb_size);
hash_table_ratio, index_sparseness);
}
extern TableFactory* NewTotalOrderPlainTableFactory(uint32_t user_key_len,
int bloom_bits_per_key,
size_t index_sparseness,
size_t huge_page_tlb_size) {
size_t index_sparseness) {
return new PlainTableFactory(user_key_len, bloom_bits_per_key, 0,
index_sparseness, huge_page_tlb_size);
index_sparseness);
}
} // namespace rocksdb

@ -56,19 +56,14 @@ class PlainTableFactory : public TableFactory {
// inside the same prefix. It will be the maximum number of linear search
// required after hash and binary search.
// index_sparseness = 0 means index for every key.
// huge_page_tlb_size determines whether to allocate hash indexes from huge
// page TLB and the page size if allocating from there. See comments of
// Arena::AllocateAligned() for details.
explicit PlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength,
int bloom_bits_per_key = 0,
double hash_table_ratio = 0.75,
size_t index_sparseness = 16,
size_t huge_page_tlb_size = 2 * 1024 * 1024)
size_t index_sparseness = 16)
: user_key_len_(user_key_len),
bloom_bits_per_key_(bloom_bits_per_key),
hash_table_ratio_(hash_table_ratio),
index_sparseness_(index_sparseness),
huge_page_tlb_size_(huge_page_tlb_size) {}
index_sparseness_(index_sparseness) {}
const char* Name() const override { return "PlainTable"; }
Status NewTableReader(const Options& options, const EnvOptions& soptions,
const InternalKeyComparator& internal_comparator,
@ -87,7 +82,6 @@ class PlainTableFactory : public TableFactory {
int bloom_bits_per_key_;
double hash_table_ratio_;
size_t index_sparseness_;
size_t huge_page_tlb_size_;
};
} // namespace rocksdb

@ -24,7 +24,6 @@
#include "table/two_level_iterator.h"
#include "table/plain_table_factory.h"
#include "util/arena.h"
#include "util/coding.h"
#include "util/dynamic_bloom.h"
#include "util/hash.h"
@ -96,8 +95,7 @@ PlainTableReader::PlainTableReader(
const Options& options, unique_ptr<RandomAccessFile>&& file,
const EnvOptions& storage_options, const InternalKeyComparator& icomparator,
uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio,
size_t index_sparseness, const TableProperties* table_properties,
size_t huge_page_tlb_size)
size_t index_sparseness, const TableProperties* table_properties)
: options_(options),
soptions_(storage_options),
file_(std::move(file)),
@ -108,23 +106,19 @@ PlainTableReader::PlainTableReader(
kIndexIntervalForSamePrefixKeys(index_sparseness),
table_properties_(nullptr),
data_end_offset_(table_properties->data_size),
user_key_len_(table_properties->fixed_key_len),
huge_page_tlb_size_(huge_page_tlb_size) {
user_key_len_(table_properties->fixed_key_len) {
assert(kHashTableRatio >= 0.0);
}
PlainTableReader::~PlainTableReader() {
}
Status PlainTableReader::Open(const Options& options,
const EnvOptions& soptions,
const InternalKeyComparator& internal_comparator,
unique_ptr<RandomAccessFile>&& file,
uint64_t file_size,
unique_ptr<TableReader>* table_reader,
const int bloom_bits_per_key,
double hash_table_ratio, size_t index_sparseness,
size_t huge_page_tlb_size) {
Status PlainTableReader::Open(
const Options& options, const EnvOptions& soptions,
const InternalKeyComparator& internal_comparator,
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key,
double hash_table_ratio, size_t index_sparseness) {
assert(options.allow_mmap_reads);
if (file_size > kMaxFileSize) {
@ -140,8 +134,7 @@ Status PlainTableReader::Open(const Options& options,
std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
options, std::move(file), soptions, internal_comparator, file_size,
bloom_bits_per_key, hash_table_ratio, index_sparseness, props,
huge_page_tlb_size));
bloom_bits_per_key, hash_table_ratio, index_sparseness, props));
// -- Populate Index
s = new_reader->PopulateIndex(props);
@ -268,11 +261,12 @@ Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list,
}
void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) {
index_.reset();
if (options_.prefix_extractor.get() != nullptr) {
uint32_t bloom_total_bits = num_prefixes * kBloomBitsPerKey;
if (bloom_total_bits > 0) {
bloom_.reset(new DynamicBloom(bloom_total_bits, options_.bloom_locality,
6, nullptr, huge_page_tlb_size_));
bloom_.reset(new DynamicBloom(bloom_total_bits, options_.bloom_locality));
}
}
@ -284,6 +278,7 @@ void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) {
double hash_table_size_multipier = 1.0 / kHashTableRatio;
index_size_ = num_prefixes * hash_table_size_multipier + 1;
}
index_.reset(new uint32_t[index_size_]);
}
size_t PlainTableReader::BucketizeIndexesAndFillBloom(
@ -327,12 +322,7 @@ void PlainTableReader::FillIndexes(
const std::vector<uint32_t>& entries_per_bucket) {
Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index",
kSubIndexSize);
auto total_allocate_size = sizeof(uint32_t) * index_size_ + kSubIndexSize;
char* allocated =
arena_.AllocateAligned(total_allocate_size, huge_page_tlb_size_);
index_ = reinterpret_cast<uint32_t*>(allocated);
sub_index_ = allocated + sizeof(uint32_t) * index_size_;
sub_index_.reset(new char[kSubIndexSize]);
size_t sub_index_offset = 0;
for (int i = 0; i < index_size_; i++) {
uint32_t num_keys_for_bucket = entries_per_bucket[i];
@ -397,8 +387,7 @@ Status PlainTableReader::PopulateIndex(TableProperties* props) {
if (IsTotalOrderMode()) {
uint32_t num_bloom_bits = table_properties_->num_entries * kBloomBitsPerKey;
if (num_bloom_bits > 0) {
bloom_.reset(new DynamicBloom(num_bloom_bits, options_.bloom_locality, 6,
nullptr, huge_page_tlb_size_));
bloom_.reset(new DynamicBloom(num_bloom_bits, options_.bloom_locality));
}
}

@ -19,7 +19,6 @@
#include "rocksdb/table_properties.h"
#include "table/table_reader.h"
#include "table/plain_table_factory.h"
#include "util/arena.h"
namespace rocksdb {
@ -53,7 +52,7 @@ class PlainTableReader: public TableReader {
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
unique_ptr<TableReader>* table,
const int bloom_bits_per_key, double hash_table_ratio,
size_t index_sparseness, size_t huge_page_tlb_size);
size_t index_sparseness);
Iterator* NewIterator(const ReadOptions&);
@ -75,8 +74,7 @@ class PlainTableReader: public TableReader {
const InternalKeyComparator& internal_comparator,
uint64_t file_size, int bloom_num_bits,
double hash_table_ratio, size_t index_sparseness,
const TableProperties* table_properties,
size_t huge_page_tlb_size);
const TableProperties* table_properties);
virtual ~PlainTableReader();
protected:
@ -138,9 +136,9 @@ class PlainTableReader: public TableReader {
// For more details about the in-memory index, please refer to:
// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
// #wiki-in-memory-index-format
uint32_t* index_;
std::unique_ptr<uint32_t[]> index_;
int index_size_ = 0;
char* sub_index_;
std::unique_ptr<char[]> sub_index_;
Options options_;
const EnvOptions& soptions_;
@ -161,7 +159,6 @@ class PlainTableReader: public TableReader {
const size_t kIndexIntervalForSamePrefixKeys = 16;
// Bloom filter is used to rule out non-existent key
unique_ptr<DynamicBloom> bloom_;
Arena arena_;
std::shared_ptr<const TableProperties> table_properties_;
// data_start_offset_ and data_end_offset_ defines the range of the
@ -169,7 +166,6 @@ class PlainTableReader: public TableReader {
const uint32_t data_start_offset_ = 0;
const uint32_t data_end_offset_;
const size_t user_key_len_;
const size_t huge_page_tlb_size_;
static const size_t kNumInternalBytes = 8;
static const uint32_t kSubIndexMask = 0x80000000;

@ -8,7 +8,6 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "util/arena.h"
#include <sys/mman.h>
#include <algorithm>
namespace rocksdb {
@ -39,13 +38,6 @@ Arena::~Arena() {
for (const auto& block : blocks_) {
delete[] block;
}
for (const auto& mmap_info : huge_blocks_) {
auto ret = munmap(mmap_info.addr_, mmap_info.length_);
if (ret != 0) {
// TODO(sdong): Better handling
perror("munmap");
}
}
}
char* Arena::AllocateFallback(size_t bytes, bool aligned) {
@ -71,29 +63,9 @@ char* Arena::AllocateFallback(size_t bytes, bool aligned) {
}
}
char* Arena::AllocateAligned(size_t bytes, size_t huge_page_tlb_size) {
char* Arena::AllocateAligned(size_t bytes) {
assert((kAlignUnit & (kAlignUnit - 1)) ==
0); // Pointer size should be a power of 2
#ifdef OS_LINUX
if (huge_page_tlb_size > 0 && bytes > 0) {
// Allocate from a huge page TBL table.
size_t reserved_size =
((bytes - 1U) / huge_page_tlb_size + 1U) * huge_page_tlb_size;
assert(reserved_size >= bytes);
void* addr = mmap(nullptr, reserved_size, (PROT_READ | PROT_WRITE),
(MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), 0, 0);
if (addr == MAP_FAILED) {
perror("mmap");
// fail back to malloc
} else {
blocks_memory_ += reserved_size;
huge_blocks_.push_back(MmapInfo(addr, reserved_size));
return reinterpret_cast<char*>(addr);
}
}
#endif
size_t current_mod =
reinterpret_cast<uintptr_t>(aligned_alloc_ptr_) & (kAlignUnit - 1);
size_t slop = (current_mod == 0 ? 0 : kAlignUnit - current_mod);

@ -34,14 +34,7 @@ class Arena {
char* Allocate(size_t bytes);
// huge_page_tlb_size: if >0, allocate bytes from huge page TLB and the size
// of the huge page TLB. Bytes will be rounded up to multiple and 2MB and
// allocate huge pages through mmap anonymous option with huge page on.
// The extra space allocated will be wasted. To enable it, need to reserve
// huge pages for it to be allocated, like:
// sysctl -w vm.nr_hugepages=20
// See linux doc Documentation/vm/hugetlbpage.txt for details.
char* AllocateAligned(size_t bytes, size_t huge_page_tlb_size = 0);
char* AllocateAligned(size_t bytes);
// Returns an estimate of the total memory usage of data allocated
// by the arena (exclude the space allocated but not yet used for future
@ -67,14 +60,6 @@ class Arena {
// Array of new[] allocated memory blocks
typedef std::vector<char*> Blocks;
Blocks blocks_;
struct MmapInfo {
void* addr_;
size_t length_;
MmapInfo(void* addr, size_t length) : addr_(addr), length_(length) {}
};
std::vector<MmapInfo> huge_blocks_;
size_t irregular_block_num = 0;
// Stats for current active block.

@ -19,19 +19,18 @@ static uint32_t BloomHash(const Slice& key) {
}
}
DynamicBloom::DynamicBloom(uint32_t total_bits, uint32_t cl_per_block,
DynamicBloom::DynamicBloom(uint32_t total_bits,
uint32_t cl_per_block,
uint32_t num_probes,
uint32_t (*hash_func)(const Slice& key),
size_t huge_page_tlb_size)
: kBlocked(cl_per_block > 0),
kBitsPerBlock(std::min(cl_per_block, num_probes) * CACHE_LINE_SIZE * 8),
kTotalBits((kBlocked ? (total_bits + kBitsPerBlock - 1) / kBitsPerBlock *
kBitsPerBlock
: total_bits + 7) /
8 * 8),
kNumBlocks(kBlocked ? kTotalBits / kBitsPerBlock : 1),
kNumProbes(num_probes),
hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {
uint32_t (*hash_func)(const Slice& key))
: kBlocked(cl_per_block > 0),
kBitsPerBlock(std::min(cl_per_block, num_probes) * CACHE_LINE_SIZE * 8),
kTotalBits((kBlocked ? (total_bits + kBitsPerBlock - 1) / kBitsPerBlock
* kBitsPerBlock :
total_bits + 7) / 8 * 8),
kNumBlocks(kBlocked ? kTotalBits / kBitsPerBlock : 1),
kNumProbes(num_probes),
hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {
assert(kBlocked ? kTotalBits > 0 : kTotalBits >= kBitsPerBlock);
assert(kNumProbes > 0);
@ -39,9 +38,7 @@ DynamicBloom::DynamicBloom(uint32_t total_bits, uint32_t cl_per_block,
if (kBlocked) {
sz += CACHE_LINE_SIZE - 1;
}
raw_ = reinterpret_cast<unsigned char*>(
arena_.AllocateAligned(sz, huge_page_tlb_size));
memset(raw_, 0, sz);
raw_ = new unsigned char[sz]();
if (kBlocked && (reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE)) {
data_ = raw_ + CACHE_LINE_SIZE -
reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE;

@ -8,8 +8,6 @@
#include <atomic>
#include <memory>
#include <util/arena.h>
namespace rocksdb {
class Slice;
@ -21,17 +19,13 @@ class DynamicBloom {
// cl_per_block: block size in cache lines. When this is non-zero, a
// query/set is done within a block to improve cache locality.
// hash_func: customized hash function
// huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB
// withi this page size. Need to reserve huge pages for
// it to be allocated, like:
// sysctl -w vm.nr_hugepages=20
// See linux doc Documentation/vm/hugetlbpage.txt
explicit DynamicBloom(uint32_t total_bits, uint32_t cl_per_block = 0,
uint32_t num_probes = 6,
uint32_t (*hash_func)(const Slice& key) = nullptr,
size_t huge_page_tlb_size = 0);
uint32_t num_probes = 6,
uint32_t (*hash_func)(const Slice& key) = nullptr);
~DynamicBloom() {}
~DynamicBloom() {
delete[] raw_;
}
// Assuming single threaded access to this function.
void Add(const Slice& key);
@ -55,8 +49,6 @@ class DynamicBloom {
uint32_t (*hash_func_)(const Slice& key);
unsigned char* data_;
unsigned char* raw_;
Arena arena_;
};
inline void DynamicBloom::Add(const Slice& key) { AddHash(hash_func_(key)); }

@ -53,8 +53,7 @@ struct Node {
class HashLinkListRep : public MemTableRep {
public:
HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena,
const SliceTransform* transform, size_t bucket_size,
size_t huge_page_tlb_size);
const SliceTransform* transform, size_t bucket_size);
virtual KeyHandle Allocate(const size_t len, char** buf) override;
@ -307,13 +306,13 @@ class HashLinkListRep : public MemTableRep {
HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare,
Arena* arena, const SliceTransform* transform,
size_t bucket_size, size_t huge_page_tlb_size)
: MemTableRep(arena),
bucket_size_(bucket_size),
transform_(transform),
compare_(compare) {
char* mem = arena_->AllocateAligned(sizeof(port::AtomicPointer) * bucket_size,
huge_page_tlb_size);
size_t bucket_size)
: MemTableRep(arena),
bucket_size_(bucket_size),
transform_(transform),
compare_(compare) {
char* mem = arena_->AllocateAligned(
sizeof(port::AtomicPointer) * bucket_size);
buckets_ = new (mem) port::AtomicPointer[bucket_size];
@ -470,13 +469,11 @@ Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head,
MemTableRep* HashLinkListRepFactory::CreateMemTableRep(
const MemTableRep::KeyComparator& compare, Arena* arena,
const SliceTransform* transform) {
return new HashLinkListRep(compare, arena, transform, bucket_count_,
huge_page_tlb_size_);
return new HashLinkListRep(compare, arena, transform, bucket_count_);
}
MemTableRepFactory* NewHashLinkListRepFactory(size_t bucket_count,
size_t huge_page_tlb_size) {
return new HashLinkListRepFactory(bucket_count, huge_page_tlb_size);
MemTableRepFactory* NewHashLinkListRepFactory(size_t bucket_count) {
return new HashLinkListRepFactory(bucket_count);
}
} // namespace rocksdb

@ -15,9 +15,8 @@ namespace rocksdb {
class HashLinkListRepFactory : public MemTableRepFactory {
public:
explicit HashLinkListRepFactory(size_t bucket_count,
size_t huge_page_tlb_size)
: bucket_count_(bucket_count), huge_page_tlb_size_(huge_page_tlb_size) {}
explicit HashLinkListRepFactory(size_t bucket_count)
: bucket_count_(bucket_count) { }
virtual ~HashLinkListRepFactory() {}
@ -31,7 +30,6 @@ class HashLinkListRepFactory : public MemTableRepFactory {
private:
const size_t bucket_count_;
const size_t huge_page_tlb_size_;
};
}

@ -34,7 +34,8 @@ ColumnFamilyOptions::ColumnFamilyOptions()
compaction_filter(nullptr),
compaction_filter_factory(std::shared_ptr<CompactionFilterFactory>(
new DefaultCompactionFilterFactory())),
compaction_filter_factory_v2(new DefaultCompactionFilterFactoryV2()),
compaction_filter_factory_v2(
new DefaultCompactionFilterFactoryV2()),
write_buffer_size(4 << 20),
max_write_buffer_number(2),
min_write_buffer_number_to_merge(1),
@ -80,7 +81,6 @@ ColumnFamilyOptions::ColumnFamilyOptions()
inplace_callback(nullptr),
memtable_prefix_bloom_bits(0),
memtable_prefix_bloom_probes(6),
memtable_prefix_bloom_huge_page_tlb_size(0),
bloom_locality(0),
max_successive_merges(0),
min_partial_merge_operands(2) {
@ -146,8 +146,6 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
inplace_callback(options.inplace_callback),
memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits),
memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes),
memtable_prefix_bloom_huge_page_tlb_size(
options.memtable_prefix_bloom_huge_page_tlb_size),
bloom_locality(options.bloom_locality),
max_successive_merges(options.max_successive_merges),
min_partial_merge_operands(options.min_partial_merge_operands) {
@ -430,8 +428,6 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
memtable_prefix_bloom_bits);
Log(log, " Options.memtable_prefix_bloom_probes: %d",
memtable_prefix_bloom_probes);
Log(log, " Options.memtable_prefix_bloom_huge_page_tlb_size: %zu",
memtable_prefix_bloom_huge_page_tlb_size);
Log(log, " Options.bloom_locality: %d",
bloom_locality);
Log(log, " Options.max_successive_merges: %zd",

Loading…
Cancel
Save