From d69dc64be78a8da3ce661454655966d11ff61bb6 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Sun, 4 May 2014 08:37:09 -0700 Subject: [PATCH] Revert "Allow allocating dynamic bloom, plain table indexes and hash linked list from huge page TLB" This reverts commit 7dafa3a1d7e63d4cc4a7d717ab958efc616a2892. --- db/db_test.cc | 2 +- db/memtable.cc | 7 +- db/plain_table_db_test.cc | 631 +++++++++++++++++----------------- db/prefix_test.cc | 8 - include/rocksdb/memtablerep.h | 9 +- include/rocksdb/options.h | 8 - include/rocksdb/table.h | 17 +- table/plain_table_factory.cc | 14 +- table/plain_table_factory.h | 10 +- table/plain_table_reader.cc | 41 +-- table/plain_table_reader.h | 12 +- util/arena.cc | 30 +- util/arena.h | 17 +- util/dynamic_bloom.cc | 27 +- util/dynamic_bloom.h | 18 +- util/hash_linklist_rep.cc | 25 +- util/hash_linklist_rep.h | 6 +- util/options.cc | 8 +- 18 files changed, 374 insertions(+), 516 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 350160af6..5162cec99 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -481,7 +481,7 @@ class DBTest { break; case kHashLinkList: options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0)); + options.memtable_factory.reset(NewHashLinkListRepFactory(4)); break; case kHashCuckoo: options.memtable_factory.reset( diff --git a/db/memtable.cc b/db/memtable.cc index f95ad3c98..424efe845 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -52,10 +52,9 @@ MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options) // gone wrong already. 
assert(!should_flush_); if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) { - prefix_bloom_.reset(new DynamicBloom( - options.memtable_prefix_bloom_bits, options.bloom_locality, - options.memtable_prefix_bloom_probes, nullptr, - options.memtable_prefix_bloom_huge_page_tlb_size)); + prefix_bloom_.reset(new DynamicBloom(options.memtable_prefix_bloom_bits, + options.bloom_locality, + options.memtable_prefix_bloom_probes)); } } diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 17e3e61d8..517ef0a94 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -185,7 +185,7 @@ class TestPlainTableReader : public PlainTableReader { const Options& options, bool* expect_bloom_not_match) : PlainTableReader(options, std::move(file), storage_options, icomparator, file_size, bloom_bits_per_key, hash_table_ratio, - index_sparseness, table_properties, 2 * 1024 * 1024), + index_sparseness, table_properties), expect_bloom_not_match_(expect_bloom_not_match) { Status s = PopulateIndex(const_cast(table_properties)); ASSERT_TRUE(s.ok()); @@ -206,12 +206,13 @@ extern const uint64_t kPlainTableMagicNumber; class TestPlainTableFactory : public PlainTableFactory { public: explicit TestPlainTableFactory(bool* expect_bloom_not_match, - uint32_t user_key_len, int bloom_bits_per_key, - double hash_table_ratio, - size_t index_sparseness, - size_t huge_page_tlb_size) + uint32_t user_key_len = + kPlainTableVariableLength, + int bloom_bits_per_key = 0, + double hash_table_ratio = 0.75, + size_t index_sparseness = 16) : PlainTableFactory(user_key_len, user_key_len, hash_table_ratio, - index_sparseness, huge_page_tlb_size), + hash_table_ratio), bloom_bits_per_key_(bloom_bits_per_key), hash_table_ratio_(hash_table_ratio), index_sparseness_(index_sparseness), @@ -243,209 +244,197 @@ class TestPlainTableFactory : public PlainTableFactory { }; TEST(PlainTableDBTest, Flush) { - for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024; - 
huge_page_tlb_size += 2 * 1024 * 1024) { - for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { - for (int total_order = 0; total_order <= 1; total_order++) { - Options options = CurrentOptions(); - options.create_if_missing = true; - // Set only one bucket to force bucket conflict. - // Test index interval for the same prefix to be 1, 2 and 4 - if (total_order) { - options.table_factory.reset(NewTotalOrderPlainTableFactory( - 16, bloom_bits, 2, huge_page_tlb_size)); - } else { - options.table_factory.reset(NewPlainTableFactory( - 16, bloom_bits, 0.75, 16, huge_page_tlb_size)); - } - DestroyAndReopen(&options); - - ASSERT_OK(Put("1000000000000foo", "v1")); - ASSERT_OK(Put("0000000000000bar", "v2")); - ASSERT_OK(Put("1000000000000foo", "v3")); - dbfull()->TEST_FlushMemTable(); - - TablePropertiesCollection ptc; - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); - ASSERT_EQ(1U, ptc.size()); - auto row = ptc.begin(); - auto tp = row->second; - ASSERT_EQ(total_order ? "4" : "12", (tp->user_collected_properties).at( - "plain_table_hash_table_size")); - ASSERT_EQ(total_order ? "9" : "0", (tp->user_collected_properties).at( - "plain_table_sub_index_size")); - - ASSERT_EQ("v3", Get("1000000000000foo")); - ASSERT_EQ("v2", Get("0000000000000bar")); + for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { + for (int total_order = 0; total_order <= 1; total_order++) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. 
+ // Test index interval for the same prefix to be 1, 2 and 4 + if (total_order) { + options.table_factory.reset( + NewTotalOrderPlainTableFactory(16, bloom_bits, 2)); + } else { + options.table_factory.reset(NewPlainTableFactory(16, bloom_bits)); } + DestroyAndReopen(&options); + + ASSERT_OK(Put("1000000000000foo", "v1")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("1000000000000foo", "v3")); + dbfull()->TEST_FlushMemTable(); + + TablePropertiesCollection ptc; + reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_EQ(1U, ptc.size()); + auto row = ptc.begin(); + auto tp = row->second; + ASSERT_EQ( + total_order ? "4" : "12", + (tp->user_collected_properties).at("plain_table_hash_table_size")); + ASSERT_EQ( + total_order ? "9" : "0", + (tp->user_collected_properties).at("plain_table_sub_index_size")); + + ASSERT_EQ("v3", Get("1000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); } } } TEST(PlainTableDBTest, Flush2) { - for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024; - huge_page_tlb_size += 2 * 1024 * 1024) { - for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { - for (int total_order = 0; total_order <= 1; total_order++) { - bool expect_bloom_not_match = false; - Options options = CurrentOptions(); - options.create_if_missing = true; - // Set only one bucket to force bucket conflict. 
- // Test index interval for the same prefix to be 1, 2 and 4 - if (total_order) { - options.prefix_extractor = nullptr; - options.table_factory.reset( - new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits, - 0, 2, huge_page_tlb_size)); - } else { - options.table_factory.reset( - new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits, - 0.75, 16, huge_page_tlb_size)); - } - DestroyAndReopen(&options); - ASSERT_OK(Put("0000000000000bar", "b")); - ASSERT_OK(Put("1000000000000foo", "v1")); - dbfull()->TEST_FlushMemTable(); - - ASSERT_OK(Put("1000000000000foo", "v2")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v2", Get("1000000000000foo")); - - ASSERT_OK(Put("0000000000000eee", "v3")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v3", Get("0000000000000eee")); - - ASSERT_OK(Delete("0000000000000bar")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); - - ASSERT_OK(Put("0000000000000eee", "v5")); - ASSERT_OK(Put("9000000000000eee", "v5")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v5", Get("0000000000000eee")); - - // Test Bloom Filter - if (bloom_bits > 0) { - // Neither key nor value should exist. - expect_bloom_not_match = true; - ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar")); + for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { + for (int total_order = 0; total_order <= 1; total_order++) { + bool expect_bloom_not_match = false; + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. 
+ // Test index interval for the same prefix to be 1, 2 and 4 + if (total_order) { + options.prefix_extractor = nullptr; + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, 16, bloom_bits, 0, 2)); + } else { + options.table_factory.reset( + new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits)); + } + DestroyAndReopen(&options); + ASSERT_OK(Put("0000000000000bar", "b")); + ASSERT_OK(Put("1000000000000foo", "v1")); + dbfull()->TEST_FlushMemTable(); - // Key doesn't exist any more but prefix exists. - if (total_order) { - ASSERT_EQ("NOT_FOUND", Get("1000000000000not")); - ASSERT_EQ("NOT_FOUND", Get("0000000000000not")); - } - expect_bloom_not_match = false; + ASSERT_OK(Put("1000000000000foo", "v2")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get("1000000000000foo")); + + ASSERT_OK(Put("0000000000000eee", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("0000000000000eee")); + + ASSERT_OK(Delete("0000000000000bar")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); + + ASSERT_OK(Put("0000000000000eee", "v5")); + ASSERT_OK(Put("9000000000000eee", "v5")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v5", Get("0000000000000eee")); + + // Test Bloom Filter + if (bloom_bits > 0) { + // Neither key nor value should exist. + expect_bloom_not_match = true; + ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar")); + + // Key doesn't exist any more but prefix exists. 
+ if (total_order) { + ASSERT_EQ("NOT_FOUND", Get("1000000000000not")); + ASSERT_EQ("NOT_FOUND", Get("0000000000000not")); } + expect_bloom_not_match = false; } } } } TEST(PlainTableDBTest, Iterator) { - for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024; - huge_page_tlb_size += 2 * 1024 * 1024) { - for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { - for (int total_order = 0; total_order <= 1; total_order++) { - bool expect_bloom_not_match = false; - Options options = CurrentOptions(); - options.create_if_missing = true; - // Set only one bucket to force bucket conflict. - // Test index interval for the same prefix to be 1, 2 and 4 - if (total_order) { - options.prefix_extractor = nullptr; - options.table_factory.reset( - new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits, - 0, 2, huge_page_tlb_size)); - } else { - options.table_factory.reset( - new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits, - 0.75, 16, huge_page_tlb_size)); - } - DestroyAndReopen(&options); - - ASSERT_OK(Put("1000000000foo002", "v_2")); - ASSERT_OK(Put("0000000000000bar", "random")); - ASSERT_OK(Put("1000000000foo001", "v1")); - ASSERT_OK(Put("3000000000000bar", "bar_v")); - ASSERT_OK(Put("1000000000foo003", "v__3")); - ASSERT_OK(Put("1000000000foo004", "v__4")); - ASSERT_OK(Put("1000000000foo005", "v__5")); - ASSERT_OK(Put("1000000000foo007", "v__7")); - ASSERT_OK(Put("1000000000foo008", "v__8")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v1", Get("1000000000foo001")); - ASSERT_EQ("v__3", Get("1000000000foo003")); - Iterator* iter = dbfull()->NewIterator(ReadOptions()); - iter->Seek("1000000000foo000"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo001", iter->key().ToString()); - ASSERT_EQ("v1", iter->value().ToString()); + for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { + for (int total_order = 0; total_order <= 1; total_order++) { + bool expect_bloom_not_match = false; + Options options = 
CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + if (total_order) { + options.prefix_extractor = nullptr; + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, 16, bloom_bits, 0, 2)); + } else { + options.table_factory.reset( + new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits)); + } + DestroyAndReopen(&options); - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo002", iter->key().ToString()); - ASSERT_EQ("v_2", iter->value().ToString()); + ASSERT_OK(Put("1000000000foo002", "v_2")); + ASSERT_OK(Put("0000000000000bar", "random")); + ASSERT_OK(Put("1000000000foo001", "v1")); + ASSERT_OK(Put("3000000000000bar", "bar_v")); + ASSERT_OK(Put("1000000000foo003", "v__3")); + ASSERT_OK(Put("1000000000foo004", "v__4")); + ASSERT_OK(Put("1000000000foo005", "v__5")); + ASSERT_OK(Put("1000000000foo007", "v__7")); + ASSERT_OK(Put("1000000000foo008", "v__8")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v1", Get("1000000000foo001")); + ASSERT_EQ("v__3", Get("1000000000foo003")); + Iterator* iter = dbfull()->NewIterator(ReadOptions()); + iter->Seek("1000000000foo000"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo001", iter->key().ToString()); + ASSERT_EQ("v1", iter->value().ToString()); - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo003", iter->key().ToString()); - ASSERT_EQ("v__3", iter->value().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo002", iter->key().ToString()); + ASSERT_EQ("v_2", iter->value().ToString()); - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo004", iter->key().ToString()); - ASSERT_EQ("v__4", iter->value().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo003", iter->key().ToString()); + ASSERT_EQ("v__3", iter->value().ToString()); - 
iter->Seek("3000000000000bar"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("3000000000000bar", iter->key().ToString()); - ASSERT_EQ("bar_v", iter->value().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo004", iter->key().ToString()); + ASSERT_EQ("v__4", iter->value().ToString()); - iter->Seek("1000000000foo000"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo001", iter->key().ToString()); - ASSERT_EQ("v1", iter->value().ToString()); + iter->Seek("3000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + ASSERT_EQ("bar_v", iter->value().ToString()); - iter->Seek("1000000000foo005"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo005", iter->key().ToString()); - ASSERT_EQ("v__5", iter->value().ToString()); + iter->Seek("1000000000foo000"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo001", iter->key().ToString()); + ASSERT_EQ("v1", iter->value().ToString()); - iter->Seek("1000000000foo006"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo007", iter->key().ToString()); - ASSERT_EQ("v__7", iter->value().ToString()); + iter->Seek("1000000000foo005"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo005", iter->key().ToString()); + ASSERT_EQ("v__5", iter->value().ToString()); - iter->Seek("1000000000foo008"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo008", iter->key().ToString()); - ASSERT_EQ("v__8", iter->value().ToString()); + iter->Seek("1000000000foo006"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo007", iter->key().ToString()); + ASSERT_EQ("v__7", iter->value().ToString()); - if (total_order == 0) { - iter->Seek("1000000000foo009"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("3000000000000bar", iter->key().ToString()); - } + iter->Seek("1000000000foo008"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo008", iter->key().ToString()); + ASSERT_EQ("v__8", iter->value().ToString()); - // Test Bloom Filter 
- if (bloom_bits > 0) { - if (!total_order) { - // Neither key nor value should exist. - expect_bloom_not_match = true; - iter->Seek("2not000000000bar"); - ASSERT_TRUE(!iter->Valid()); - ASSERT_EQ("NOT_FOUND", Get("2not000000000bar")); - expect_bloom_not_match = false; - } else { - expect_bloom_not_match = true; - ASSERT_EQ("NOT_FOUND", Get("2not000000000bar")); - expect_bloom_not_match = false; - } - } + if (total_order == 0) { + iter->Seek("1000000000foo009"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + } - delete iter; + // Test Bloom Filter + if (bloom_bits > 0) { + if (!total_order) { + // Neither key nor value should exist. + expect_bloom_not_match = true; + iter->Seek("2not000000000bar"); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ("NOT_FOUND", Get("2not000000000bar")); + expect_bloom_not_match = false; + } else { + expect_bloom_not_match = true; + ASSERT_EQ("NOT_FOUND", Get("2not000000000bar")); + expect_bloom_not_match = false; + } } + + delete iter; } } } @@ -592,173 +581,165 @@ TEST(PlainTableDBTest, IteratorReverseSuffixComparator) { } TEST(PlainTableDBTest, HashBucketConflict) { - for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024; - huge_page_tlb_size += 2 * 1024 * 1024) { - for (unsigned char i = 1; i <= 3; i++) { - Options options = CurrentOptions(); - options.create_if_missing = true; - // Set only one bucket to force bucket conflict. 
- // Test index interval for the same prefix to be 1, 2 and 4 - options.table_factory.reset( - NewTotalOrderPlainTableFactory(16, 0, 2 ^ i, huge_page_tlb_size)); - DestroyAndReopen(&options); - ASSERT_OK(Put("5000000000000fo0", "v1")); - ASSERT_OK(Put("5000000000000fo1", "v2")); - ASSERT_OK(Put("5000000000000fo2", "v")); - ASSERT_OK(Put("2000000000000fo0", "v3")); - ASSERT_OK(Put("2000000000000fo1", "v4")); - ASSERT_OK(Put("2000000000000fo2", "v")); - ASSERT_OK(Put("2000000000000fo3", "v")); - - dbfull()->TEST_FlushMemTable(); - - ASSERT_EQ("v1", Get("5000000000000fo0")); - ASSERT_EQ("v2", Get("5000000000000fo1")); - ASSERT_EQ("v3", Get("2000000000000fo0")); - ASSERT_EQ("v4", Get("2000000000000fo1")); - - ASSERT_EQ("NOT_FOUND", Get("5000000000000bar")); - ASSERT_EQ("NOT_FOUND", Get("2000000000000bar")); - ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8")); - ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8")); - - ReadOptions ro; - Iterator* iter = dbfull()->NewIterator(ro); - - iter->Seek("5000000000000fo0"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("5000000000000fo0", iter->key().ToString()); - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("5000000000000fo1", iter->key().ToString()); + for (unsigned char i = 1; i <= 3; i++) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. 
+ // Test index interval for the same prefix to be 1, 2 and 4 + options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 2 ^ i)); + DestroyAndReopen(&options); + ASSERT_OK(Put("5000000000000fo0", "v1")); + ASSERT_OK(Put("5000000000000fo1", "v2")); + ASSERT_OK(Put("5000000000000fo2", "v")); + ASSERT_OK(Put("2000000000000fo0", "v3")); + ASSERT_OK(Put("2000000000000fo1", "v4")); + ASSERT_OK(Put("2000000000000fo2", "v")); + ASSERT_OK(Put("2000000000000fo3", "v")); + + dbfull()->TEST_FlushMemTable(); + + ASSERT_EQ("v1", Get("5000000000000fo0")); + ASSERT_EQ("v2", Get("5000000000000fo1")); + ASSERT_EQ("v3", Get("2000000000000fo0")); + ASSERT_EQ("v4", Get("2000000000000fo1")); + + ASSERT_EQ("NOT_FOUND", Get("5000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("2000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8")); + ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8")); + + ReadOptions ro; + Iterator* iter = dbfull()->NewIterator(ro); + + iter->Seek("5000000000000fo0"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo0", iter->key().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo1", iter->key().ToString()); - iter->Seek("5000000000000fo1"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("5000000000000fo1", iter->key().ToString()); + iter->Seek("5000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo1", iter->key().ToString()); - iter->Seek("2000000000000fo0"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("2000000000000fo0", iter->key().ToString()); - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("2000000000000fo1", iter->key().ToString()); + iter->Seek("2000000000000fo0"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo0", iter->key().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo1", iter->key().ToString()); - iter->Seek("2000000000000fo1"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("2000000000000fo1", iter->key().ToString()); + 
iter->Seek("2000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo1", iter->key().ToString()); - iter->Seek("2000000000000bar"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("2000000000000fo0", iter->key().ToString()); + iter->Seek("2000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo0", iter->key().ToString()); - iter->Seek("5000000000000bar"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("5000000000000fo0", iter->key().ToString()); + iter->Seek("5000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo0", iter->key().ToString()); - iter->Seek("2000000000000fo8"); - ASSERT_TRUE(!iter->Valid() || - options.comparator->Compare(iter->key(), "20000001") > 0); + iter->Seek("2000000000000fo8"); + ASSERT_TRUE(!iter->Valid() || + options.comparator->Compare(iter->key(), "20000001") > 0); - iter->Seek("5000000000000fo8"); - ASSERT_TRUE(!iter->Valid()); + iter->Seek("5000000000000fo8"); + ASSERT_TRUE(!iter->Valid()); - iter->Seek("1000000000000fo2"); - ASSERT_TRUE(!iter->Valid()); + iter->Seek("1000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); - iter->Seek("3000000000000fo2"); - ASSERT_TRUE(!iter->Valid()); + iter->Seek("3000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); - iter->Seek("8000000000000fo2"); - ASSERT_TRUE(!iter->Valid()); + iter->Seek("8000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); - delete iter; - } + delete iter; } } TEST(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) { - for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024; - huge_page_tlb_size += 2 * 1024 * 1024) { - for (unsigned char i = 1; i <= 3; i++) { - Options options = CurrentOptions(); - options.create_if_missing = true; - SimpleSuffixReverseComparator comp; - options.comparator = &comp; - // Set only one bucket to force bucket conflict.
- // Test index interval for the same prefix to be 1, 2 and 4 - options.table_factory.reset( - NewTotalOrderPlainTableFactory(16, 0, 2 ^ i, huge_page_tlb_size)); - DestroyAndReopen(&options); - ASSERT_OK(Put("5000000000000fo0", "v1")); - ASSERT_OK(Put("5000000000000fo1", "v2")); - ASSERT_OK(Put("5000000000000fo2", "v")); - ASSERT_OK(Put("2000000000000fo0", "v3")); - ASSERT_OK(Put("2000000000000fo1", "v4")); - ASSERT_OK(Put("2000000000000fo2", "v")); - ASSERT_OK(Put("2000000000000fo3", "v")); - - dbfull()->TEST_FlushMemTable(); - - ASSERT_EQ("v1", Get("5000000000000fo0")); - ASSERT_EQ("v2", Get("5000000000000fo1")); - ASSERT_EQ("v3", Get("2000000000000fo0")); - ASSERT_EQ("v4", Get("2000000000000fo1")); - - ASSERT_EQ("NOT_FOUND", Get("5000000000000bar")); - ASSERT_EQ("NOT_FOUND", Get("2000000000000bar")); - ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8")); - ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8")); - - ReadOptions ro; - Iterator* iter = dbfull()->NewIterator(ro); - - iter->Seek("5000000000000fo1"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("5000000000000fo1", iter->key().ToString()); - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("5000000000000fo0", iter->key().ToString()); + for (unsigned char i = 1; i <= 3; i++) { + Options options = CurrentOptions(); + options.create_if_missing = true; + SimpleSuffixReverseComparator comp; + options.comparator = &comp; + // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4 + options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 2 ^ i)); + DestroyAndReopen(&options); + ASSERT_OK(Put("5000000000000fo0", "v1")); + ASSERT_OK(Put("5000000000000fo1", "v2")); + ASSERT_OK(Put("5000000000000fo2", "v")); + ASSERT_OK(Put("2000000000000fo0", "v3")); + ASSERT_OK(Put("2000000000000fo1", "v4")); + ASSERT_OK(Put("2000000000000fo2", "v")); + ASSERT_OK(Put("2000000000000fo3", "v")); + + dbfull()->TEST_FlushMemTable(); + + ASSERT_EQ("v1", Get("5000000000000fo0")); + ASSERT_EQ("v2", Get("5000000000000fo1")); + ASSERT_EQ("v3", Get("2000000000000fo0")); + ASSERT_EQ("v4", Get("2000000000000fo1")); + + ASSERT_EQ("NOT_FOUND", Get("5000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("2000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8")); + ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8")); + + ReadOptions ro; + Iterator* iter = dbfull()->NewIterator(ro); + + iter->Seek("5000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo1", iter->key().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo0", iter->key().ToString()); - iter->Seek("5000000000000fo1"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("5000000000000fo1", iter->key().ToString()); + iter->Seek("5000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo1", iter->key().ToString()); - iter->Seek("2000000000000fo1"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("2000000000000fo1", iter->key().ToString()); - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("2000000000000fo0", iter->key().ToString()); + iter->Seek("2000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo1", iter->key().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo0", iter->key().ToString()); - iter->Seek("2000000000000fo1"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("2000000000000fo1", iter->key().ToString()); + 
iter->Seek("2000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo1", iter->key().ToString()); - iter->Seek("2000000000000var"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("2000000000000fo3", iter->key().ToString()); + iter->Seek("2000000000000var"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo3", iter->key().ToString()); - iter->Seek("5000000000000var"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("5000000000000fo2", iter->key().ToString()); + iter->Seek("5000000000000var"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo2", iter->key().ToString()); - std::string seek_key = "2000000000000bar"; - iter->Seek(seek_key); - ASSERT_TRUE(!iter->Valid() || - options.prefix_extractor->Transform(iter->key()) != - options.prefix_extractor->Transform(seek_key)); + std::string seek_key = "2000000000000bar"; + iter->Seek(seek_key); + ASSERT_TRUE(!iter->Valid() || + options.prefix_extractor->Transform(iter->key()) != + options.prefix_extractor->Transform(seek_key)); - iter->Seek("1000000000000fo2"); - ASSERT_TRUE(!iter->Valid()); + iter->Seek("1000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); - iter->Seek("3000000000000fo2"); - ASSERT_TRUE(!iter->Valid()); + iter->Seek("3000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); - iter->Seek("8000000000000fo2"); - ASSERT_TRUE(!iter->Valid()); + iter->Seek("8000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); - delete iter; - } + delete iter; } } diff --git a/db/prefix_test.cc b/db/prefix_test.cc index 3a88fc8ce..18036bb93 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -30,7 +30,6 @@ DEFINE_int64(min_write_buffer_number_to_merge, 1, ""); DEFINE_int32(skiplist_height, 4, ""); DEFINE_int32(memtable_prefix_bloom_bits, 10000000, ""); DEFINE_int32(memtable_prefix_bloom_probes, 10, ""); -DEFINE_int32(memtable_prefix_bloom_huge_page_tlb_size, 2 * 1024 * 1024, ""); DEFINE_int32(value_size, 40, ""); // Path to the database on file system @@ -149,8 +148,6 @@ class PrefixTest { 
options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits; options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes; - options.memtable_prefix_bloom_huge_page_tlb_size = - FLAGS_memtable_prefix_bloom_huge_page_tlb_size; Status s = DB::Open(options, kDbName, &db); ASSERT_OK(s); @@ -175,10 +172,6 @@ class PrefixTest { options.memtable_factory.reset( NewHashLinkListRepFactory(bucket_count)); return true; - case kHashLinkListHugePageTlb: - options.memtable_factory.reset( - NewHashLinkListRepFactory(bucket_count, 2 * 1024 * 1024)); - return true; default: return false; } @@ -197,7 +190,6 @@ class PrefixTest { kBegin, kHashSkipList, kHashLinkList, - kHashLinkListHugePageTlb, kEnd }; int option_config_; diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index 7ab9a45de..445edccac 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -223,14 +223,9 @@ extern MemTableRepFactory* NewHashSkipListRepFactory( // The factory is to create memtables with a hashed linked list: // it contains a fixed array of buckets, each pointing to a sorted single // linked list (null if the bucket is empty). -// @bucket_count: number of fixed array buckets -// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc. -// Otherwise from huge page TLB. The user needs to reserve -// huge pages for it to be allocated, like: -// sysctl -w vm.nr_hugepages=20 -// See linux doc Documentation/vm/hugetlbpage.txt +// bucket_count: number of fixed array buckets extern MemTableRepFactory* NewHashLinkListRepFactory( - size_t bucket_count = 50000, size_t huge_page_tlb_size = 2 * 1024 * 1024); + size_t bucket_count = 50000); // This factory creates a cuckoo-hashing based mem-table representation. 
// Cuckoo-hash is a closed-hash strategy, in which all key/value pairs diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 93dbf0d88..c283a5e53 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -498,14 +498,6 @@ struct ColumnFamilyOptions { // number of hash probes per key uint32_t memtable_prefix_bloom_probes; - // Page size for huge page TLB for bloom in memtable. If <=0, not allocate - // from huge page TLB but from malloc. - // Need to reserve huge pages for it to be allocated. For example: - // sysctl -w vm.nr_hugepages=20 - // See linux doc Documentation/vm/hugetlbpage.txt - - size_t memtable_prefix_bloom_huge_page_tlb_size; - // Control locality of bloom filter probes to improve cache miss rate. // This option only applies to memtable prefix bloom and plaintable // prefix bloom. It essentially limits the max number of cache lines each diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 11adfec8c..14a505a6f 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -107,19 +107,12 @@ extern TableFactory* NewBlockBasedTableFactory( // in the hash table // @index_sparseness: inside each prefix, need to build one index record for how // many keys for binary search inside each hash bucket. -// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc. -// Otherwise from huge page TLB. 
The user needs to reserve -// huge pages for it to be allocated, like: -// sysctl -w vm.nr_hugepages=20 -// See linux doc Documentation/vm/hugetlbpage.txt - const uint32_t kPlainTableVariableLength = 0; extern TableFactory* NewPlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength, int bloom_bits_per_prefix = 10, double hash_table_ratio = 0.75, - size_t index_sparseness = 16, - size_t huge_page_tlb_size = 0); + size_t index_sparseness = 16); // -- Plain Table // This factory of plain table ignores Options.prefix_extractor and assumes no @@ -133,15 +126,9 @@ extern TableFactory* NewPlainTableFactory(uint32_t user_key_len = // disable it by passing a zero. // @index_sparseness: need to build one index record for how many keys for // binary search. -// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc. -// Otherwise from huge page TLB. The user needs to reserve -// huge pages for it to be allocated, like: -// sysctl -w vm.nr_hugepages=20 -// See linux doc Documentation/vm/hugetlbpage.txt extern TableFactory* NewTotalOrderPlainTableFactory( uint32_t user_key_len = kPlainTableVariableLength, - int bloom_bits_per_key = 0, size_t index_sparseness = 16, - size_t huge_page_tlb_size = 0); + int bloom_bits_per_key = 0, size_t index_sparseness = 16); #endif // ROCKSDB_LITE diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc index f9d88e9ef..4e844687d 100644 --- a/table/plain_table_factory.cc +++ b/table/plain_table_factory.cc @@ -22,8 +22,7 @@ Status PlainTableFactory::NewTableReader(const Options& options, unique_ptr* table) const { return PlainTableReader::Open(options, soptions, icomp, std::move(file), file_size, table, bloom_bits_per_key_, - hash_table_ratio_, index_sparseness_, - huge_page_tlb_size_); + hash_table_ratio_, index_sparseness_); } TableBuilder* PlainTableFactory::NewTableBuilder( @@ -35,19 +34,16 @@ TableBuilder* PlainTableFactory::NewTableBuilder( extern TableFactory* NewPlainTableFactory(uint32_t 
user_key_len, int bloom_bits_per_key, double hash_table_ratio, - size_t index_sparseness, - size_t huge_page_tlb_size) { + size_t index_sparseness) { return new PlainTableFactory(user_key_len, bloom_bits_per_key, - hash_table_ratio, index_sparseness, - huge_page_tlb_size); + hash_table_ratio, index_sparseness); } extern TableFactory* NewTotalOrderPlainTableFactory(uint32_t user_key_len, int bloom_bits_per_key, - size_t index_sparseness, - size_t huge_page_tlb_size) { + size_t index_sparseness) { return new PlainTableFactory(user_key_len, bloom_bits_per_key, 0, - index_sparseness, huge_page_tlb_size); + index_sparseness); } } // namespace rocksdb diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index 77d24f711..84af22fb9 100644 --- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -56,19 +56,14 @@ class PlainTableFactory : public TableFactory { // inside the same prefix. It will be the maximum number of linear search // required after hash and binary search. // index_sparseness = 0 means index for every key. - // huge_page_tlb_size determines whether to allocate hash indexes from huge - // page TLB and the page size if allocating from there. See comments of - // Arena::AllocateAligned() for details. 
explicit PlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength, int bloom_bits_per_key = 0, double hash_table_ratio = 0.75, - size_t index_sparseness = 16, - size_t huge_page_tlb_size = 2 * 1024 * 1024) + size_t index_sparseness = 16) : user_key_len_(user_key_len), bloom_bits_per_key_(bloom_bits_per_key), hash_table_ratio_(hash_table_ratio), - index_sparseness_(index_sparseness), - huge_page_tlb_size_(huge_page_tlb_size) {} + index_sparseness_(index_sparseness) {} const char* Name() const override { return "PlainTable"; } Status NewTableReader(const Options& options, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, @@ -87,7 +82,6 @@ class PlainTableFactory : public TableFactory { int bloom_bits_per_key_; double hash_table_ratio_; size_t index_sparseness_; - size_t huge_page_tlb_size_; }; } // namespace rocksdb diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index f1cb3db47..196201730 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -24,7 +24,6 @@ #include "table/two_level_iterator.h" #include "table/plain_table_factory.h" -#include "util/arena.h" #include "util/coding.h" #include "util/dynamic_bloom.h" #include "util/hash.h" @@ -96,8 +95,7 @@ PlainTableReader::PlainTableReader( const Options& options, unique_ptr&& file, const EnvOptions& storage_options, const InternalKeyComparator& icomparator, uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio, - size_t index_sparseness, const TableProperties* table_properties, - size_t huge_page_tlb_size) + size_t index_sparseness, const TableProperties* table_properties) : options_(options), soptions_(storage_options), file_(std::move(file)), @@ -108,23 +106,19 @@ PlainTableReader::PlainTableReader( kIndexIntervalForSamePrefixKeys(index_sparseness), table_properties_(nullptr), data_end_offset_(table_properties->data_size), - user_key_len_(table_properties->fixed_key_len), - huge_page_tlb_size_(huge_page_tlb_size) { + 
user_key_len_(table_properties->fixed_key_len) { assert(kHashTableRatio >= 0.0); } PlainTableReader::~PlainTableReader() { } -Status PlainTableReader::Open(const Options& options, - const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, - uint64_t file_size, - unique_ptr* table_reader, - const int bloom_bits_per_key, - double hash_table_ratio, size_t index_sparseness, - size_t huge_page_tlb_size) { +Status PlainTableReader::Open( + const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader, const int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness) { assert(options.allow_mmap_reads); if (file_size > kMaxFileSize) { @@ -140,8 +134,7 @@ Status PlainTableReader::Open(const Options& options, std::unique_ptr new_reader(new PlainTableReader( options, std::move(file), soptions, internal_comparator, file_size, - bloom_bits_per_key, hash_table_ratio, index_sparseness, props, - huge_page_tlb_size)); + bloom_bits_per_key, hash_table_ratio, index_sparseness, props)); // -- Populate Index s = new_reader->PopulateIndex(props); @@ -268,11 +261,12 @@ Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list, } void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) { + index_.reset(); + if (options_.prefix_extractor.get() != nullptr) { uint32_t bloom_total_bits = num_prefixes * kBloomBitsPerKey; if (bloom_total_bits > 0) { - bloom_.reset(new DynamicBloom(bloom_total_bits, options_.bloom_locality, - 6, nullptr, huge_page_tlb_size_)); + bloom_.reset(new DynamicBloom(bloom_total_bits, options_.bloom_locality)); } } @@ -284,6 +278,7 @@ void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) { double hash_table_size_multipier = 1.0 / kHashTableRatio; index_size_ = num_prefixes * hash_table_size_multipier + 1; } + index_.reset(new uint32_t[index_size_]); } size_t 
PlainTableReader::BucketizeIndexesAndFillBloom( @@ -327,12 +322,7 @@ void PlainTableReader::FillIndexes( const std::vector& entries_per_bucket) { Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index", kSubIndexSize); - auto total_allocate_size = sizeof(uint32_t) * index_size_ + kSubIndexSize; - char* allocated = - arena_.AllocateAligned(total_allocate_size, huge_page_tlb_size_); - index_ = reinterpret_cast(allocated); - sub_index_ = allocated + sizeof(uint32_t) * index_size_; - + sub_index_.reset(new char[kSubIndexSize]); size_t sub_index_offset = 0; for (int i = 0; i < index_size_; i++) { uint32_t num_keys_for_bucket = entries_per_bucket[i]; @@ -397,8 +387,7 @@ Status PlainTableReader::PopulateIndex(TableProperties* props) { if (IsTotalOrderMode()) { uint32_t num_bloom_bits = table_properties_->num_entries * kBloomBitsPerKey; if (num_bloom_bits > 0) { - bloom_.reset(new DynamicBloom(num_bloom_bits, options_.bloom_locality, 6, - nullptr, huge_page_tlb_size_)); + bloom_.reset(new DynamicBloom(num_bloom_bits, options_.bloom_locality)); } } diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index e6373dc82..756439b5c 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -19,7 +19,6 @@ #include "rocksdb/table_properties.h" #include "table/table_reader.h" #include "table/plain_table_factory.h" -#include "util/arena.h" namespace rocksdb { @@ -53,7 +52,7 @@ class PlainTableReader: public TableReader { unique_ptr&& file, uint64_t file_size, unique_ptr* table, const int bloom_bits_per_key, double hash_table_ratio, - size_t index_sparseness, size_t huge_page_tlb_size); + size_t index_sparseness); Iterator* NewIterator(const ReadOptions&); @@ -75,8 +74,7 @@ class PlainTableReader: public TableReader { const InternalKeyComparator& internal_comparator, uint64_t file_size, int bloom_num_bits, double hash_table_ratio, size_t index_sparseness, - const TableProperties* table_properties, - size_t huge_page_tlb_size); + const 
TableProperties* table_properties); virtual ~PlainTableReader(); protected: @@ -138,9 +136,9 @@ class PlainTableReader: public TableReader { // For more details about the in-memory index, please refer to: // https://github.com/facebook/rocksdb/wiki/PlainTable-Format // #wiki-in-memory-index-format - uint32_t* index_; + std::unique_ptr index_; int index_size_ = 0; - char* sub_index_; + std::unique_ptr sub_index_; Options options_; const EnvOptions& soptions_; @@ -161,7 +159,6 @@ class PlainTableReader: public TableReader { const size_t kIndexIntervalForSamePrefixKeys = 16; // Bloom filter is used to rule out non-existent key unique_ptr bloom_; - Arena arena_; std::shared_ptr table_properties_; // data_start_offset_ and data_end_offset_ defines the range of the @@ -169,7 +166,6 @@ class PlainTableReader: public TableReader { const uint32_t data_start_offset_ = 0; const uint32_t data_end_offset_; const size_t user_key_len_; - const size_t huge_page_tlb_size_; static const size_t kNumInternalBytes = 8; static const uint32_t kSubIndexMask = 0x80000000; diff --git a/util/arena.cc b/util/arena.cc index 3575f2d90..9b2cb82d1 100644 --- a/util/arena.cc +++ b/util/arena.cc @@ -8,7 +8,6 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "util/arena.h" -#include #include namespace rocksdb { @@ -39,13 +38,6 @@ Arena::~Arena() { for (const auto& block : blocks_) { delete[] block; } - for (const auto& mmap_info : huge_blocks_) { - auto ret = munmap(mmap_info.addr_, mmap_info.length_); - if (ret != 0) { - // TODO(sdong): Better handling - perror("munmap"); - } - } } char* Arena::AllocateFallback(size_t bytes, bool aligned) { @@ -71,29 +63,9 @@ char* Arena::AllocateFallback(size_t bytes, bool aligned) { } } -char* Arena::AllocateAligned(size_t bytes, size_t huge_page_tlb_size) { +char* Arena::AllocateAligned(size_t bytes) { assert((kAlignUnit & (kAlignUnit - 1)) == 0); // Pointer size should be a power of 2 - -#ifdef OS_LINUX - if (huge_page_tlb_size > 0 && bytes > 0) { - // Allocate from a huge page TBL table. - size_t reserved_size = - ((bytes - 1U) / huge_page_tlb_size + 1U) * huge_page_tlb_size; - assert(reserved_size >= bytes); - void* addr = mmap(nullptr, reserved_size, (PROT_READ | PROT_WRITE), - (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), 0, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - // fail back to malloc - } else { - blocks_memory_ += reserved_size; - huge_blocks_.push_back(MmapInfo(addr, reserved_size)); - return reinterpret_cast(addr); - } - } -#endif - size_t current_mod = reinterpret_cast(aligned_alloc_ptr_) & (kAlignUnit - 1); size_t slop = (current_mod == 0 ? 0 : kAlignUnit - current_mod); diff --git a/util/arena.h b/util/arena.h index a4dff495b..6ce5a438d 100644 --- a/util/arena.h +++ b/util/arena.h @@ -34,14 +34,7 @@ class Arena { char* Allocate(size_t bytes); - // huge_page_tlb_size: if >0, allocate bytes from huge page TLB and the size - // of the huge page TLB. Bytes will be rounded up to multiple and 2MB and - // allocate huge pages through mmap anonymous option with huge page on. - // The extra space allocated will be wasted. 
To enable it, need to reserve - // huge pages for it to be allocated, like: - // sysctl -w vm.nr_hugepages=20 - // See linux doc Documentation/vm/hugetlbpage.txt for details. - char* AllocateAligned(size_t bytes, size_t huge_page_tlb_size = 0); + char* AllocateAligned(size_t bytes); // Returns an estimate of the total memory usage of data allocated // by the arena (exclude the space allocated but not yet used for future @@ -67,14 +60,6 @@ class Arena { // Array of new[] allocated memory blocks typedef std::vector Blocks; Blocks blocks_; - - struct MmapInfo { - void* addr_; - size_t length_; - - MmapInfo(void* addr, size_t length) : addr_(addr), length_(length) {} - }; - std::vector huge_blocks_; size_t irregular_block_num = 0; // Stats for current active block. diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc index bc48b9fd3..a4c8e11cb 100644 --- a/util/dynamic_bloom.cc +++ b/util/dynamic_bloom.cc @@ -19,19 +19,18 @@ static uint32_t BloomHash(const Slice& key) { } } -DynamicBloom::DynamicBloom(uint32_t total_bits, uint32_t cl_per_block, +DynamicBloom::DynamicBloom(uint32_t total_bits, + uint32_t cl_per_block, uint32_t num_probes, - uint32_t (*hash_func)(const Slice& key), - size_t huge_page_tlb_size) - : kBlocked(cl_per_block > 0), - kBitsPerBlock(std::min(cl_per_block, num_probes) * CACHE_LINE_SIZE * 8), - kTotalBits((kBlocked ? (total_bits + kBitsPerBlock - 1) / kBitsPerBlock * - kBitsPerBlock - : total_bits + 7) / - 8 * 8), - kNumBlocks(kBlocked ? kTotalBits / kBitsPerBlock : 1), - kNumProbes(num_probes), - hash_func_(hash_func == nullptr ? &BloomHash : hash_func) { + uint32_t (*hash_func)(const Slice& key)) + : kBlocked(cl_per_block > 0), + kBitsPerBlock(std::min(cl_per_block, num_probes) * CACHE_LINE_SIZE * 8), + kTotalBits((kBlocked ? (total_bits + kBitsPerBlock - 1) / kBitsPerBlock + * kBitsPerBlock : + total_bits + 7) / 8 * 8), + kNumBlocks(kBlocked ? kTotalBits / kBitsPerBlock : 1), + kNumProbes(num_probes), + hash_func_(hash_func == nullptr ? 
&BloomHash : hash_func) { assert(kBlocked ? kTotalBits > 0 : kTotalBits >= kBitsPerBlock); assert(kNumProbes > 0); @@ -39,9 +38,7 @@ DynamicBloom::DynamicBloom(uint32_t total_bits, uint32_t cl_per_block, if (kBlocked) { sz += CACHE_LINE_SIZE - 1; } - raw_ = reinterpret_cast( - arena_.AllocateAligned(sz, huge_page_tlb_size)); - memset(raw_, 0, sz); + raw_ = new unsigned char[sz](); if (kBlocked && (reinterpret_cast(raw_) % CACHE_LINE_SIZE)) { data_ = raw_ + CACHE_LINE_SIZE - reinterpret_cast(raw_) % CACHE_LINE_SIZE; diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h index f91bb8f91..efc461cf9 100644 --- a/util/dynamic_bloom.h +++ b/util/dynamic_bloom.h @@ -8,8 +8,6 @@ #include #include -#include - namespace rocksdb { class Slice; @@ -21,17 +19,13 @@ class DynamicBloom { // cl_per_block: block size in cache lines. When this is non-zero, a // query/set is done within a block to improve cache locality. // hash_func: customized hash function - // huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB - // withi this page size. Need to reserve huge pages for - // it to be allocated, like: - // sysctl -w vm.nr_hugepages=20 - // See linux doc Documentation/vm/hugetlbpage.txt explicit DynamicBloom(uint32_t total_bits, uint32_t cl_per_block = 0, - uint32_t num_probes = 6, - uint32_t (*hash_func)(const Slice& key) = nullptr, - size_t huge_page_tlb_size = 0); + uint32_t num_probes = 6, + uint32_t (*hash_func)(const Slice& key) = nullptr); - ~DynamicBloom() {} + ~DynamicBloom() { + delete[] raw_; + } // Assuming single threaded access to this function. 
void Add(const Slice& key); @@ -55,8 +49,6 @@ class DynamicBloom { uint32_t (*hash_func_)(const Slice& key); unsigned char* data_; unsigned char* raw_; - - Arena arena_; }; inline void DynamicBloom::Add(const Slice& key) { AddHash(hash_func_(key)); } diff --git a/util/hash_linklist_rep.cc b/util/hash_linklist_rep.cc index acd78c5bb..64aa2d9e8 100644 --- a/util/hash_linklist_rep.cc +++ b/util/hash_linklist_rep.cc @@ -53,8 +53,7 @@ struct Node { class HashLinkListRep : public MemTableRep { public: HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena, - const SliceTransform* transform, size_t bucket_size, - size_t huge_page_tlb_size); + const SliceTransform* transform, size_t bucket_size); virtual KeyHandle Allocate(const size_t len, char** buf) override; @@ -307,13 +306,13 @@ class HashLinkListRep : public MemTableRep { HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena, const SliceTransform* transform, - size_t bucket_size, size_t huge_page_tlb_size) - : MemTableRep(arena), - bucket_size_(bucket_size), - transform_(transform), - compare_(compare) { - char* mem = arena_->AllocateAligned(sizeof(port::AtomicPointer) * bucket_size, - huge_page_tlb_size); + size_t bucket_size) + : MemTableRep(arena), + bucket_size_(bucket_size), + transform_(transform), + compare_(compare) { + char* mem = arena_->AllocateAligned( + sizeof(port::AtomicPointer) * bucket_size); buckets_ = new (mem) port::AtomicPointer[bucket_size]; @@ -470,13 +469,11 @@ Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head, MemTableRep* HashLinkListRepFactory::CreateMemTableRep( const MemTableRep::KeyComparator& compare, Arena* arena, const SliceTransform* transform) { - return new HashLinkListRep(compare, arena, transform, bucket_count_, - huge_page_tlb_size_); + return new HashLinkListRep(compare, arena, transform, bucket_count_); } -MemTableRepFactory* NewHashLinkListRepFactory(size_t bucket_count, - size_t huge_page_tlb_size) { - return new 
HashLinkListRepFactory(bucket_count, huge_page_tlb_size); +MemTableRepFactory* NewHashLinkListRepFactory(size_t bucket_count) { + return new HashLinkListRepFactory(bucket_count); } } // namespace rocksdb diff --git a/util/hash_linklist_rep.h b/util/hash_linklist_rep.h index 4a9fd0009..f1ab5d560 100644 --- a/util/hash_linklist_rep.h +++ b/util/hash_linklist_rep.h @@ -15,9 +15,8 @@ namespace rocksdb { class HashLinkListRepFactory : public MemTableRepFactory { public: - explicit HashLinkListRepFactory(size_t bucket_count, - size_t huge_page_tlb_size) - : bucket_count_(bucket_count), huge_page_tlb_size_(huge_page_tlb_size) {} + explicit HashLinkListRepFactory(size_t bucket_count) + : bucket_count_(bucket_count) { } virtual ~HashLinkListRepFactory() {} @@ -31,7 +30,6 @@ class HashLinkListRepFactory : public MemTableRepFactory { private: const size_t bucket_count_; - const size_t huge_page_tlb_size_; }; } diff --git a/util/options.cc b/util/options.cc index c8d1e3889..e33d44ebe 100644 --- a/util/options.cc +++ b/util/options.cc @@ -34,7 +34,8 @@ ColumnFamilyOptions::ColumnFamilyOptions() compaction_filter(nullptr), compaction_filter_factory(std::shared_ptr( new DefaultCompactionFilterFactory())), - compaction_filter_factory_v2(new DefaultCompactionFilterFactoryV2()), + compaction_filter_factory_v2( + new DefaultCompactionFilterFactoryV2()), write_buffer_size(4 << 20), max_write_buffer_number(2), min_write_buffer_number_to_merge(1), @@ -80,7 +81,6 @@ ColumnFamilyOptions::ColumnFamilyOptions() inplace_callback(nullptr), memtable_prefix_bloom_bits(0), memtable_prefix_bloom_probes(6), - memtable_prefix_bloom_huge_page_tlb_size(0), bloom_locality(0), max_successive_merges(0), min_partial_merge_operands(2) { @@ -146,8 +146,6 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options) inplace_callback(options.inplace_callback), memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits), memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes), - 
memtable_prefix_bloom_huge_page_tlb_size( - options.memtable_prefix_bloom_huge_page_tlb_size), bloom_locality(options.bloom_locality), max_successive_merges(options.max_successive_merges), min_partial_merge_operands(options.min_partial_merge_operands) { @@ -430,8 +428,6 @@ void ColumnFamilyOptions::Dump(Logger* log) const { memtable_prefix_bloom_bits); Log(log, " Options.memtable_prefix_bloom_probes: %d", memtable_prefix_bloom_probes); - Log(log, " Options.memtable_prefix_bloom_huge_page_tlb_size: %zu", - memtable_prefix_bloom_huge_page_tlb_size); Log(log, " Options.bloom_locality: %d", bloom_locality); Log(log, " Options.max_successive_merges: %zd",