From f3ae3d07cc10b7ad7df293778b3acbb924462505 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Fri, 7 Feb 2014 16:25:38 -0800 Subject: [PATCH] Add more black-box tests for PlainTable and explicitly support total order mode Summary: 1. Add some more implementation-aware tests for PlainTable 2. move from a hard-coded one index per 16 rows in one prefix to a configurable number. Also, make hash table ratio = 0 means binary search only. Also fixes some divide 0 risks. 3. Explicitly support total order (only use binary search) 4. some code cleaning up. Test Plan: make all check Reviewers: haobo, kailiu Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D16023 --- db/db_impl.cc | 5 +- db/plain_table_db_test.cc | 548 +++++++++++++++++++++++++++++++---- include/rocksdb/table.h | 34 ++- table/plain_table_factory.cc | 14 +- table/plain_table_factory.h | 13 +- table/plain_table_reader.cc | 197 +++++++++---- table/plain_table_reader.h | 148 ++++++---- table/table_test.cc | 38 ++- 8 files changed, 805 insertions(+), 192 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index df7451b8e..2d987941f 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2754,7 +2754,10 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, Iterator* DBImpl::TEST_NewInternalIterator() { SequenceNumber ignored; - return NewInternalIterator(ReadOptions(), &ignored); + ReadOptions read_options; + // Use prefix_seek to make the test function more useful. 
+ read_options.prefix_seek = true; + return NewInternalIterator(read_options, &ignored); } std::pair DBImpl::GetTailingIteratorPair( diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 0d554278c..c195253b0 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -22,7 +22,9 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" +#include "table/meta_blocks.h" #include "table/plain_table_factory.h" +#include "table/plain_table_reader.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" @@ -46,6 +48,7 @@ class PlainTableDBTest { public: PlainTableDBTest() : env_(Env::Default()) { + ro_.prefix_seek = true; dbname_ = test::TmpDir() + "/plain_table_db_test"; ASSERT_OK(DestroyDB(dbname_, Options())); db_ = nullptr; @@ -57,10 +60,12 @@ class PlainTableDBTest { ASSERT_OK(DestroyDB(dbname_, Options())); } + ReadOptions ro_; + // Return the current option configuration. Options CurrentOptions() { Options options; - options.table_factory.reset(new PlainTableFactory(16, 2, 0.8)); + options.table_factory.reset(NewPlainTableFactory(16, 2, 0.8, 3)); options.prefix_extractor = prefix_transform.get(); options.allow_mmap_reads = true; return options; @@ -119,7 +124,7 @@ class PlainTableDBTest { } std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { - ReadOptions options; + ReadOptions options = ro_; options.snapshot = snapshot; std::string result; Status s = db_->Get(options, k, &result); @@ -176,25 +181,298 @@ TEST(PlainTableDBTest, Empty) { ASSERT_EQ("NOT_FOUND", Get("0000000000000foo")); } -TEST(PlainTableDBTest, ReadWrite) { - ASSERT_OK(Put("1000000000000foo", "v1")); - ASSERT_EQ("v1", Get("1000000000000foo")); - ASSERT_OK(Put("0000000000000bar", "v2")); - ASSERT_OK(Put("1000000000000foo", "v3")); - ASSERT_EQ("v3", Get("1000000000000foo")); - ASSERT_EQ("v2", Get("0000000000000bar")); -} +class TestPlainTableReader : public PlainTableReader { + 
+ public: + TestPlainTableReader(const EnvOptions& storage_options, + const InternalKeyComparator& icomparator, + uint64_t file_size, int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness, + const TableProperties* table_properties, + unique_ptr&& file, + const Options& options, bool* expect_bloom_not_match) + : PlainTableReader(storage_options, icomparator, file_size, + bloom_bits_per_key, hash_table_ratio, index_sparseness, + table_properties), + expect_bloom_not_match_(expect_bloom_not_match) { + file_ = std::move(file); + options_ = options; + Status s = PopulateIndex(); + ASSERT_TRUE(s.ok()); + } + + virtual ~TestPlainTableReader() {} + + private: + virtual bool MatchBloom(uint32_t hash) const override { + bool ret = PlainTableReader::MatchBloom(hash); + ASSERT_TRUE(!*expect_bloom_not_match_ || !ret); + return ret; + } + bool* expect_bloom_not_match_; +}; + +extern const uint64_t kPlainTableMagicNumber; +class TestPlainTableFactory : public PlainTableFactory { + public: + explicit TestPlainTableFactory(bool* expect_bloom_not_match, + uint32_t user_key_len = + kPlainTableVariableLength, + int bloom_bits_per_key = 0, + double hash_table_ratio = 0.75, + size_t index_sparseness = 16) + : PlainTableFactory(user_key_len, bloom_bits_per_key, hash_table_ratio, + index_sparseness), + user_key_len_(user_key_len), + bloom_bits_per_key_(bloom_bits_per_key), + hash_table_ratio_(hash_table_ratio), + index_sparseness_(index_sparseness), + expect_bloom_not_match_(expect_bloom_not_match) {} + + Status NewTableReader(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table) const override { + TableProperties* props = nullptr; + auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, + options.env, options.info_log.get(), &props); + ASSERT_TRUE(s.ok()); + + std::unique_ptr new_reader(new TestPlainTableReader( + soptions, internal_comparator, 
file_size, bloom_bits_per_key_, + hash_table_ratio_, index_sparseness_, props, std::move(file), options, + expect_bloom_not_match_)); + + *table = std::move(new_reader); + return s; + } + + private: + uint32_t user_key_len_; + int bloom_bits_per_key_; + double hash_table_ratio_; + size_t index_sparseness_; + bool* expect_bloom_not_match_; +}; TEST(PlainTableDBTest, Flush) { - ASSERT_OK(Put("1000000000000foo", "v1")); - ASSERT_OK(Put("0000000000000bar", "v2")); - ASSERT_OK(Put("1000000000000foo", "v3")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v3", Get("1000000000000foo")); - ASSERT_EQ("v2", Get("0000000000000bar")); + for (int bloom_bits = 0; bloom_bits <= 8; bloom_bits += 8) { + for (int total_order = 0; total_order <= 1; total_order++) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + if (total_order) { + options.table_factory.reset( + NewTotalOrderPlainTableFactory(16, bloom_bits, 2)); + } else { + options.table_factory.reset(NewPlainTableFactory(16, bloom_bits)); + } + DestroyAndReopen(&options); + + ASSERT_OK(Put("1000000000000foo", "v1")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("1000000000000foo", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("1000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); + } + } +} + +TEST(PlainTableDBTest, Flush2) { + for (int bloom_bits = 0; bloom_bits <= 10; bloom_bits += 10) { + for (int total_order = 0; total_order <= 1; total_order++) { + bool expect_bloom_not_match = false; + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. 
+ // Test index interval for the same prefix to be 1, 2 and 4 + if (total_order) { + options.prefix_extractor = nullptr; + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, 16, bloom_bits, 0, 2)); + } else { + options.table_factory.reset( + new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits)); + } + DestroyAndReopen(&options); + ASSERT_OK(Put("0000000000000bar", "b")); + ASSERT_OK(Put("1000000000000foo", "v1")); + dbfull()->TEST_FlushMemTable(); + + ASSERT_OK(Put("1000000000000foo", "v2")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get("1000000000000foo")); + + ASSERT_OK(Put("0000000000000eee", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("0000000000000eee")); + + ASSERT_OK(Delete("0000000000000bar")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); + + ASSERT_OK(Put("0000000000000eee", "v5")); + ASSERT_OK(Put("9000000000000eee", "v5")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v5", Get("0000000000000eee")); + + // Test Bloom Filter + if (bloom_bits > 0) { + // Neither key nor value should exist. + expect_bloom_not_match = true; + ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar")); + + // Key doesn't exist any more but prefix exists. + if (total_order) { + ASSERT_EQ("NOT_FOUND", Get("1000000000000not")); + ASSERT_EQ("NOT_FOUND", Get("0000000000000not")); + } + expect_bloom_not_match = false; + } + } + } } TEST(PlainTableDBTest, Iterator) { + for (int bloom_bits = 0; bloom_bits <= 8; bloom_bits += 8) { + for (int total_order = 0; total_order <= 1; total_order++) { + bool expect_bloom_not_match = false; + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. 
+ // Test index interval for the same prefix to be 1, 2 and 4 + if (total_order) { + options.prefix_extractor = nullptr; + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, 16, bloom_bits, 0, 2)); + } else { + options.table_factory.reset( + new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits)); + } + DestroyAndReopen(&options); + + ASSERT_OK(Put("1000000000foo002", "v_2")); + ASSERT_OK(Put("0000000000000bar", "random")); + ASSERT_OK(Put("1000000000foo001", "v1")); + ASSERT_OK(Put("3000000000000bar", "bar_v")); + ASSERT_OK(Put("1000000000foo003", "v__3")); + ASSERT_OK(Put("1000000000foo004", "v__4")); + ASSERT_OK(Put("1000000000foo005", "v__5")); + ASSERT_OK(Put("1000000000foo007", "v__7")); + ASSERT_OK(Put("1000000000foo008", "v__8")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v1", Get("1000000000foo001")); + ASSERT_EQ("v__3", Get("1000000000foo003")); + Iterator* iter = dbfull()->NewIterator(ro_); + iter->Seek("1000000000foo000"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo001", iter->key().ToString()); + ASSERT_EQ("v1", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo002", iter->key().ToString()); + ASSERT_EQ("v_2", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo003", iter->key().ToString()); + ASSERT_EQ("v__3", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo004", iter->key().ToString()); + ASSERT_EQ("v__4", iter->value().ToString()); + + iter->Seek("3000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + ASSERT_EQ("bar_v", iter->value().ToString()); + + iter->Seek("1000000000foo000"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo001", iter->key().ToString()); + ASSERT_EQ("v1", iter->value().ToString()); + + iter->Seek("1000000000foo005"); + ASSERT_TRUE(iter->Valid()); + 
ASSERT_EQ("1000000000foo005", iter->key().ToString()); + ASSERT_EQ("v__5", iter->value().ToString()); + + iter->Seek("1000000000foo006"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo007", iter->key().ToString()); + ASSERT_EQ("v__7", iter->value().ToString()); + + iter->Seek("1000000000foo008"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo008", iter->key().ToString()); + ASSERT_EQ("v__8", iter->value().ToString()); + + if (total_order == 0) { + iter->Seek("1000000000foo009"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + } + + // Test Bloom Filter + if (bloom_bits > 0) { + // Neither key nor value should exist. + expect_bloom_not_match = true; + iter->Seek("2not000000000bar"); + ASSERT_TRUE(!iter->Valid()); + + // Key doesn't exist any more but prefix exists. + if (total_order) { + iter->Seek("2not000000000bar"); + ASSERT_TRUE(!iter->Valid()); + } + expect_bloom_not_match = false; + } + + delete iter; + } + } +} + +// A test comparator which compare two strings in this way: +// (1) first compare prefix of 8 bytes in alphabet order, +// (2) if two strings share the same prefix, sort the other part of the string +// in the reverse alphabet order. 
+class SimpleSuffixReverseComparator : public Comparator { + public: + SimpleSuffixReverseComparator() {} + + virtual const char* Name() const { return "SimpleSuffixReverseComparator"; } + + virtual int Compare(const Slice& a, const Slice& b) const { + Slice prefix_a = Slice(a.data(), 8); + Slice prefix_b = Slice(b.data(), 8); + int prefix_comp = prefix_a.compare(prefix_b); + if (prefix_comp != 0) { + return prefix_comp; + } else { + Slice suffix_a = Slice(a.data() + 8, a.size() - 8); + Slice suffix_b = Slice(b.data() + 8, b.size() - 8); + return -(suffix_a.compare(suffix_b)); + } + } + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const {} + + virtual void FindShortSuccessor(std::string* key) const {} +}; + +TEST(PlainTableDBTest, IteratorReverseSuffixComparator) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + SimpleSuffixReverseComparator comp; + options.comparator = &comp; + DestroyAndReopen(&options); + ASSERT_OK(Put("1000000000foo002", "v_2")); ASSERT_OK(Put("0000000000000bar", "random")); ASSERT_OK(Put("1000000000foo001", "v1")); @@ -207,22 +485,21 @@ TEST(PlainTableDBTest, Iterator) { dbfull()->TEST_FlushMemTable(); ASSERT_EQ("v1", Get("1000000000foo001")); ASSERT_EQ("v__3", Get("1000000000foo003")); - ReadOptions ro; - Iterator* iter = dbfull()->NewIterator(ro); - iter->Seek("1000000000foo001"); + Iterator* iter = dbfull()->NewIterator(ro_); + iter->Seek("1000000000foo009"); ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo001", iter->key().ToString()); - ASSERT_EQ("v1", iter->value().ToString()); + ASSERT_EQ("1000000000foo008", iter->key().ToString()); + ASSERT_EQ("v__8", iter->value().ToString()); iter->Next(); ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo002", iter->key().ToString()); - ASSERT_EQ("v_2", iter->value().ToString()); + ASSERT_EQ("1000000000foo007", 
iter->key().ToString()); + ASSERT_EQ("v__7", iter->value().ToString()); iter->Next(); ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo003", iter->key().ToString()); - ASSERT_EQ("v__3", iter->value().ToString()); + ASSERT_EQ("1000000000foo005", iter->key().ToString()); + ASSERT_EQ("v__5", iter->value().ToString()); iter->Next(); ASSERT_TRUE(iter->Valid()); @@ -234,11 +511,6 @@ TEST(PlainTableDBTest, Iterator) { ASSERT_EQ("3000000000000bar", iter->key().ToString()); ASSERT_EQ("bar_v", iter->value().ToString()); - iter->Seek("1000000000foo000"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo001", iter->key().ToString()); - ASSERT_EQ("v1", iter->value().ToString()); - iter->Seek("1000000000foo005"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("1000000000foo005", iter->key().ToString()); @@ -246,42 +518,220 @@ TEST(PlainTableDBTest, Iterator) { iter->Seek("1000000000foo006"); ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo007", iter->key().ToString()); - ASSERT_EQ("v__7", iter->value().ToString()); + ASSERT_EQ("1000000000foo005", iter->key().ToString()); + ASSERT_EQ("v__5", iter->value().ToString()); iter->Seek("1000000000foo008"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("1000000000foo008", iter->key().ToString()); ASSERT_EQ("v__8", iter->value().ToString()); - iter->Seek("1000000000foo009"); + iter->Seek("1000000000foo000"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("3000000000000bar", iter->key().ToString()); - delete iter; } -TEST(PlainTableDBTest, Flush2) { - ASSERT_OK(Put("0000000000000bar", "b")); - ASSERT_OK(Put("1000000000000foo", "v1")); - dbfull()->TEST_FlushMemTable(); +TEST(PlainTableDBTest, HashBucketConflict) { + for (unsigned char i = 1; i <= 3; i++) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. 
+ // Test index interval for the same prefix to be 1, 2 and 4 + options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 1 << (i - 1))); + DestroyAndReopen(&options); + ASSERT_OK(Put("5000000000000fo0", "v1")); + ASSERT_OK(Put("5000000000000fo1", "v2")); + ASSERT_OK(Put("5000000000000fo2", "v")); + ASSERT_OK(Put("2000000000000fo0", "v3")); + ASSERT_OK(Put("2000000000000fo1", "v4")); + ASSERT_OK(Put("2000000000000fo2", "v")); + ASSERT_OK(Put("2000000000000fo3", "v")); + + dbfull()->TEST_FlushMemTable(); + + ASSERT_EQ("v1", Get("5000000000000fo0")); + ASSERT_EQ("v2", Get("5000000000000fo1")); + ASSERT_EQ("v3", Get("2000000000000fo0")); + ASSERT_EQ("v4", Get("2000000000000fo1")); + + ASSERT_EQ("NOT_FOUND", Get("5000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("2000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8")); + ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8")); + + ReadOptions ro; + Iterator* iter = dbfull()->NewIterator(ro); + + iter->Seek("5000000000000fo0"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo0", iter->key().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo1", iter->key().ToString()); + + iter->Seek("5000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo1", iter->key().ToString()); + + iter->Seek("2000000000000fo0"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo0", iter->key().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo1", iter->key().ToString()); + + iter->Seek("2000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo1", iter->key().ToString()); + + iter->Seek("2000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo0", iter->key().ToString()); + + iter->Seek("5000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo0", iter->key().ToString()); + + iter->Seek("2000000000000fo8"); + ASSERT_TRUE(!iter->Valid() || + 
options.comparator->Compare(iter->key(), "20000001") > 0); + + iter->Seek("5000000000000fo8"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("1000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("3000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("8000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + delete iter; + } +} - ASSERT_OK(Put("1000000000000foo", "v2")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v2", Get("1000000000000foo")); +TEST(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) { + for (unsigned char i = 1; i <= 3; i++) { + Options options = CurrentOptions(); + options.create_if_missing = true; + SimpleSuffixReverseComparator comp; + options.comparator = &comp; + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 1 << (i - 1))); + DestroyAndReopen(&options); + ASSERT_OK(Put("5000000000000fo0", "v1")); + ASSERT_OK(Put("5000000000000fo1", "v2")); + ASSERT_OK(Put("5000000000000fo2", "v")); + ASSERT_OK(Put("2000000000000fo0", "v3")); + ASSERT_OK(Put("2000000000000fo1", "v4")); + ASSERT_OK(Put("2000000000000fo2", "v")); + ASSERT_OK(Put("2000000000000fo3", "v")); + + dbfull()->TEST_FlushMemTable(); + + ASSERT_EQ("v1", Get("5000000000000fo0")); + ASSERT_EQ("v2", Get("5000000000000fo1")); + ASSERT_EQ("v3", Get("2000000000000fo0")); + ASSERT_EQ("v4", Get("2000000000000fo1")); + + ASSERT_EQ("NOT_FOUND", Get("5000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("2000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8")); + ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8")); + + ReadOptions ro; + Iterator* iter = dbfull()->NewIterator(ro); + + iter->Seek("5000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo1", iter->key().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo0", iter->key().ToString()); + + iter->Seek("5000000000000fo1"); + 
ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo1", iter->key().ToString()); + + iter->Seek("2000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo1", iter->key().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo0", iter->key().ToString()); + + iter->Seek("2000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo1", iter->key().ToString()); + + iter->Seek("2000000000000var"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo3", iter->key().ToString()); + + iter->Seek("5000000000000var"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo2", iter->key().ToString()); + + std::string seek_key = "2000000000000bar"; + iter->Seek(seek_key); + ASSERT_TRUE(!iter->Valid() || + options.prefix_extractor->Transform(iter->key()) != + options.prefix_extractor->Transform(seek_key)); + + iter->Seek("1000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("3000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("8000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + delete iter; + } +} - ASSERT_OK(Put("0000000000000eee", "v3")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v3", Get("0000000000000eee")); +TEST(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. 
+ // Use an index interval of 5 for keys sharing the same prefix + options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 5)); + DestroyAndReopen(&options); + ASSERT_OK(Put("5000000000000fo0", "v1")); + ASSERT_OK(Put("5000000000000fo1", "v2")); + ASSERT_OK(Put("5000000000000fo2", "v3")); - ASSERT_OK(Delete("0000000000000bar")); dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); - ASSERT_OK(Put("0000000000000eee", "v5")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v5", Get("0000000000000eee")); + ASSERT_EQ("v1", Get("5000000000000fo0")); + ASSERT_EQ("v2", Get("5000000000000fo1")); + ASSERT_EQ("v3", Get("5000000000000fo2")); + + ASSERT_EQ("NOT_FOUND", Get("8000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("1000000000000bar")); + + Iterator* iter = dbfull()->NewIterator(ro_); + + iter->Seek("5000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo0", iter->key().ToString()); + + iter->Seek("5000000000000fo8"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("1000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("8000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + delete iter; } static std::string Key(int i) { diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index d4965ca45..5c04257ff 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -60,20 +60,44 @@ struct BlockBasedTableOptions { extern TableFactory* NewBlockBasedTableFactory( const BlockBasedTableOptions& table_options = BlockBasedTableOptions()); -// -- Plain Table +// -- Plain Table with prefix-only seek +// For this factory, you need to set Options.prefix_extractor properly to make +// it work. Look-up starts with a prefix hash lookup for the key prefix. Inside +// the hash bucket found, a binary search is executed for hash conflicts. +// Finally, a linear search is used. // @user_key_len: plain table has optimization for fix-sized keys, which can be // specified via user_key_len. 
Alternatively, you can pass // `kPlainTableVariableLength` if your keys have variable // lengths. -// @bloom_bits_per_key: the number of bits used for bloom filer per key. You may -// disable it by passing a zero. +// @bloom_bits_per_prefix: the number of bits used for bloom filter per +// prefix. You may disable it by passing a zero. // @hash_table_ratio: the desired utilization of the hash table used for prefix // hashing. hash_table_ratio = number of prefixes / #buckets // in the hash table +// @index_sparseness: inside each prefix, one index record is built for every +// index_sparseness keys, for binary search inside each hash bucket. const uint32_t kPlainTableVariableLength = 0; -extern TableFactory* NewPlainTableFactory( +extern TableFactory* NewPlainTableFactory(uint32_t user_key_len = + kPlainTableVariableLength, + int bloom_bits_per_prefix = 10, + double hash_table_ratio = 0.75, + size_t index_sparseness = 16); + +// -- Plain Table +// This factory of plain table ignores Options.prefix_extractor and assumes no +// hashable prefix available to the key structure. Lookup will be based on +// binary search index only. Total order seek() can be issued. +// @user_key_len: plain table has optimization for fix-sized keys, which can be +// specified via user_key_len. Alternatively, you can pass +// `kPlainTableVariableLength` if your keys have variable +// lengths. +// @bloom_bits_per_key: the number of bits used for bloom filter per key. You +// may disable it by passing a zero. +// @index_sparseness: one index record is built for every index_sparseness +// keys, for binary search. +extern TableFactory* NewTotalOrderPlainTableFactory( uint32_t user_key_len = kPlainTableVariableLength, - int bloom_bits_per_key = 10, double hash_table_ratio = 0.75); + int bloom_bits_per_key = 0, size_t index_sparseness = 16); // A base class for table factories. 
class TableFactory { diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc index c7ee8eb2f..16ee24eb4 100644 --- a/table/plain_table_factory.cc +++ b/table/plain_table_factory.cc @@ -21,7 +21,7 @@ Status PlainTableFactory::NewTableReader(const Options& options, unique_ptr* table) const { return PlainTableReader::Open(options, soptions, icomp, std::move(file), file_size, table, bloom_bits_per_key_, - hash_table_ratio_); + hash_table_ratio_, index_sparseness_); } TableBuilder* PlainTableFactory::NewTableBuilder( @@ -32,9 +32,17 @@ TableBuilder* PlainTableFactory::NewTableBuilder( extern TableFactory* NewPlainTableFactory(uint32_t user_key_len, int bloom_bits_per_key, - double hash_table_ratio) { + double hash_table_ratio, + size_t index_sparseness) { return new PlainTableFactory(user_key_len, bloom_bits_per_key, - hash_table_ratio); + hash_table_ratio, index_sparseness); +} + +extern TableFactory* NewTotalOrderPlainTableFactory(uint32_t user_key_len, + int bloom_bits_per_key, + size_t index_sparseness) { + return new PlainTableFactory(user_key_len, bloom_bits_per_key, 0, + index_sparseness); } } // namespace rocksdb diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index 382efe3c1..a0a7fbe6f 100644 --- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -48,12 +48,20 @@ class PlainTableFactory : public TableFactory { // number of bits used for bloom filer per key. hash_table_ratio is // the desired utilization of the hash table used for prefix hashing. // hash_table_ratio = number of prefixes / #buckets in the hash table + // hash_table_ratio = 0 means skipping the hash table and relying only on + // binary search. + // index_sparseness determines index interval for keys + // inside the same prefix. It will be the maximum number of linear search + // steps required after hash and binary search. + // index_sparseness = 0 means index for every key. 
explicit PlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength, int bloom_bits_per_key = 0, - double hash_table_ratio = 0.75) + double hash_table_ratio = 0.75, + size_t index_sparseness = 16) : user_key_len_(user_key_len), bloom_bits_per_key_(bloom_bits_per_key), - hash_table_ratio_(hash_table_ratio) {} + hash_table_ratio_(hash_table_ratio), + index_sparseness_(index_sparseness) {} const char* Name() const override { return "PlainTable"; } Status NewTableReader(const Options& options, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, @@ -71,6 +79,7 @@ class PlainTableFactory : public TableFactory { uint32_t user_key_len_; int bloom_bits_per_key_; double hash_table_ratio_; + size_t index_sparseness_; }; } // namespace rocksdb diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index cf1025097..3595cbe66 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -48,7 +48,7 @@ inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) { // Iterator to iterate IndexedTable class PlainTableIterator : public Iterator { public: - explicit PlainTableIterator(PlainTableReader* table); + explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek); ~PlainTableIterator(); bool Valid() const; @@ -71,6 +71,7 @@ class PlainTableIterator : public Iterator { private: PlainTableReader* table_; + bool use_prefix_seek_; uint32_t offset_; uint32_t next_offset_; Slice key_; @@ -87,12 +88,14 @@ PlainTableReader::PlainTableReader(const EnvOptions& storage_options, const InternalKeyComparator& icomparator, uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio, + size_t index_sparseness, const TableProperties* table_properties) : soptions_(storage_options), internal_comparator_(icomparator), file_size_(file_size), kHashTableRatio(hash_table_ratio), kBloomBitsPerKey(bloom_bits_per_key), + kIndexIntervalForSamePrefixKeys(index_sparseness), 
table_properties_(table_properties), data_end_offset_(table_properties_->data_size), user_key_len_(table_properties->fixed_key_len) {} @@ -103,14 +106,12 @@ PlainTableReader::~PlainTableReader() { delete bloom_; } -Status PlainTableReader::Open(const Options& options, - const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, - uint64_t file_size, - unique_ptr* table_reader, - const int bloom_bits_per_key, - double hash_table_ratio) { +Status PlainTableReader::Open( + const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader, const int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness) { assert(options.allow_mmap_reads); if (file_size > kMaxFileSize) { @@ -124,9 +125,9 @@ Status PlainTableReader::Open(const Options& options, return s; } - std::unique_ptr new_reader( - new PlainTableReader(soptions, internal_comparator, file_size, - bloom_bits_per_key, hash_table_ratio, props)); + std::unique_ptr new_reader(new PlainTableReader( + soptions, internal_comparator, file_size, bloom_bits_per_key, + hash_table_ratio, index_sparseness, props)); new_reader->file_ = std::move(file); new_reader->options_ = options; @@ -148,7 +149,7 @@ bool PlainTableReader::PrefixMayMatch(const Slice& internal_prefix) { } Iterator* PlainTableReader::NewIterator(const ReadOptions& options) { - return new PlainTableIterator(this); + return new PlainTableIterator(this, options.prefix_seek); } struct PlainTableReader::IndexRecord { @@ -204,7 +205,9 @@ class PlainTableReader::IndexRecordList { size_t num_records_in_current_group_; }; -int PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list) { +Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list, + int* num_prefixes, + DynamicBloom* bloom_) const { Slice prev_key_prefix_slice; uint32_t prev_key_prefix_hash = 0; uint32_t pos = 
data_start_offset_; @@ -214,16 +217,23 @@ int PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list) { // Need map to be ordered to make sure sub indexes generated // are in order. - int num_prefixes = 0; + *num_prefixes = 0; while (pos < data_end_offset_) { uint32_t key_offset = pos; ParsedInternalKey key; Slice value_slice; - status_ = Next(pos, &key, &value_slice, pos); + Status s = Next(pos, &key, &value_slice, &pos); + if (!s.ok()) { + return s; + } + if (bloom_) { + // total order mode and bloom filter is enabled. + bloom_->AddHash(GetSliceHash(key.user_key)); + } Slice key_prefix_slice = GetPrefix(key); if (is_first_record || prev_key_prefix_slice != key_prefix_slice) { - ++num_prefixes; + ++(*num_prefixes); if (!is_first_record) { keys_per_prefix_hist.Add(key_index_within_prefix); } @@ -232,7 +242,8 @@ int PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list) { prev_key_prefix_hash = GetSliceHash(key_prefix_slice); } - if (key_index_within_prefix++ % kIndexIntervalForSamePrefixKeys == 0) { + if (kIndexIntervalForSamePrefixKeys == 0 || + key_index_within_prefix++ % kIndexIntervalForSamePrefixKeys == 0) { // Add an index key for every kIndexIntervalForSamePrefixKeys keys record_list->AddRecord(prev_key_prefix_hash, key_offset); } @@ -243,18 +254,27 @@ int PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list) { Log(options_.info_log, "Number of Keys per prefix Histogram: %s", keys_per_prefix_hist.ToString().c_str()); - return num_prefixes; + return Status::OK(); } void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) { delete[] hash_table_; - if (kBloomBitsPerKey > 0) { - bloom_ = new DynamicBloom(num_prefixes * kBloomBitsPerKey); + if (options_.prefix_extractor != nullptr) { + uint32_t bloom_total_bits = num_prefixes * kBloomBitsPerKey; + if (bloom_total_bits > 0) { + bloom_ = new DynamicBloom(bloom_total_bits); + } + } + + if (options_.prefix_extractor == nullptr || kHashTableRatio <= 0) { + // 
Fall back to pure binary search if the user fails to specify a prefix + // extractor. + hash_table_size_ = 1; + } else { + double hash_table_size_multipier = 1.0 / kHashTableRatio; + hash_table_size_ = num_prefixes * hash_table_size_multipier + 1; } - double hash_table_size_multipier = - (kHashTableRatio > 1.0) ? 1.0 : 1.0 / kHashTableRatio; - hash_table_size_ = num_prefixes * hash_table_size_multipier + 1; hash_table_ = new uint32_t[hash_table_size_]; } @@ -272,7 +292,7 @@ size_t PlainTableReader::BucketizeIndexesAndFillBloom( if (first || prev_hash != cur_hash) { prev_hash = cur_hash; first = false; - if (bloom_) { + if (bloom_ && !IsTotalOrderMode()) { bloom_->AddHash(cur_hash); } } @@ -362,6 +382,12 @@ void PlainTableReader::FillIndexes( } Status PlainTableReader::PopulateIndex() { + // options.prefix_extractor is requried for a hash-based look-up. + if (options_.prefix_extractor == nullptr && kHashTableRatio != 0) { + return Status::NotSupported( + "PlainTable requires a prefix extractor enable prefix hash mode."); + } + // Get mmapped memory to file_data_. Status s = file_->Read(0, file_size_, &file_data_, nullptr); if (!s.ok()) { @@ -373,7 +399,20 @@ Status PlainTableReader::PopulateIndex() { // for a prefix (starting from the first one), generate a record of (hash, // offset) and append it to IndexRecordList, which is a data structure created // to store them. - int num_prefixes = PopulateIndexRecordList(&record_list); + int num_prefixes; + + // Allocate bloom filter here for total order mode. + if (IsTotalOrderMode()) { + uint32_t num_bloom_bits = table_properties_->num_entries * kBloomBitsPerKey; + if (num_bloom_bits > 0) { + bloom_ = new DynamicBloom(num_bloom_bits); + } + } + + s = PopulateIndexRecordList(&record_list, &num_prefixes, bloom_); + if (!s.ok()) { + return s; + } // Calculated hash table and bloom filter size and allocate memory for indexes // and bloom filter based on the number of prefixes. 
AllocateIndexAndBloom(num_prefixes); @@ -392,16 +431,16 @@ Status PlainTableReader::PopulateIndex() { Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, uint32_t prefix_hash, bool& prefix_matched, - uint32_t& ret_offset) { + uint32_t* ret_offset) const { prefix_matched = false; int bucket = GetBucketIdFromHash(prefix_hash, hash_table_size_); uint32_t bucket_value = hash_table_[bucket]; if (bucket_value == data_end_offset_) { - ret_offset = data_end_offset_; + *ret_offset = data_end_offset_; return Status::OK(); } else if ((bucket_value & kSubIndexMask) == 0) { // point directly to the file - ret_offset = bucket_value; + *ret_offset = bucket_value; return Status::OK(); } @@ -426,7 +465,7 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, uint32_t mid = (high + low) / 2; uint32_t file_offset = base_ptr[mid]; size_t tmp; - Status s = ReadKey(file_data_.data() + file_offset, &mid_key, tmp); + Status s = ReadKey(file_data_.data() + file_offset, &mid_key, &tmp); if (!s.ok()) { return s; } @@ -438,7 +477,7 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, // Happen to have found the exact key or target is smaller than the // first key after base_offset. 
prefix_matched = true; - ret_offset = file_offset; + *ret_offset = file_offset; return Status::OK(); } else { high = mid; @@ -451,34 +490,34 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, ParsedInternalKey low_key; size_t tmp; uint32_t low_key_offset = base_ptr[low]; - Status s = ReadKey(file_data_.data() + low_key_offset, &low_key, tmp); + Status s = ReadKey(file_data_.data() + low_key_offset, &low_key, &tmp); if (GetPrefix(low_key) == prefix) { prefix_matched = true; - ret_offset = low_key_offset; + *ret_offset = low_key_offset; } else if (low + 1 < upper_bound) { // There is possible a next prefix, return it prefix_matched = false; - ret_offset = base_ptr[low + 1]; + *ret_offset = base_ptr[low + 1]; } else { // target is larger than a key of the last prefix in this bucket // but with a different prefix. Key does not exist. - ret_offset = data_end_offset_; + *ret_offset = data_end_offset_; } return Status::OK(); } -bool PlainTableReader::MayHavePrefix(uint32_t hash) { +bool PlainTableReader::MatchBloom(uint32_t hash) const { return bloom_ == nullptr || bloom_->MayContainHash(hash); } -Slice PlainTableReader::GetPrefix(const ParsedInternalKey& target) { - return options_.prefix_extractor->Transform(target.user_key); +Slice PlainTableReader::GetPrefix(const ParsedInternalKey& target) const { + return GetPrefixFromUserKey(target.user_key); } Status PlainTableReader::ReadKey(const char* row_ptr, ParsedInternalKey* key, - size_t& bytes_read) { + size_t* bytes_read) const { const char* key_ptr = nullptr; - bytes_read = 0; + *bytes_read = 0; size_t user_key_size = 0; if (IsFixedLength()) { user_key_size = user_key_len_; @@ -491,7 +530,7 @@ Status PlainTableReader::ReadKey(const char* row_ptr, ParsedInternalKey* key, return Status::Corruption("Unable to read the next key"); } user_key_size = (size_t)tmp_size; - bytes_read = key_ptr - row_ptr; + *bytes_read = key_ptr - row_ptr; } if (key_ptr + user_key_size + 1 >= file_data_.data() + 
data_end_offset_) { return Status::Corruption("Unable to read the next key"); @@ -502,7 +541,7 @@ Status PlainTableReader::ReadKey(const char* row_ptr, ParsedInternalKey* key, key->user_key = Slice(key_ptr, user_key_size); key->sequence = 0; key->type = kTypeValue; - bytes_read += user_key_size + 1; + *bytes_read += user_key_size + 1; } else { if (row_ptr + user_key_size + 8 >= file_data_.data() + data_end_offset_) { return Status::Corruption("Unable to read the next key"); @@ -510,16 +549,16 @@ Status PlainTableReader::ReadKey(const char* row_ptr, ParsedInternalKey* key, if (!ParseInternalKey(Slice(key_ptr, user_key_size + 8), key)) { return Status::Corruption(Slice()); } - bytes_read += user_key_size + 8; + *bytes_read += user_key_size + 8; } return Status::OK(); } Status PlainTableReader::Next(uint32_t offset, ParsedInternalKey* key, - Slice* value, uint32_t& next_offset) { + Slice* value, uint32_t* next_offset) const { if (offset == data_end_offset_) { - next_offset = data_end_offset_; + *next_offset = data_end_offset_; return Status::OK(); } @@ -529,7 +568,7 @@ Status PlainTableReader::Next(uint32_t offset, ParsedInternalKey* key, const char* row_ptr = file_data_.data() + offset; size_t bytes_for_key; - Status s = ReadKey(row_ptr, key, bytes_for_key); + Status s = ReadKey(row_ptr, key, &bytes_for_key); uint32_t value_size; const char* value_ptr = GetVarint32Ptr(row_ptr + bytes_for_key, file_data_.data() + data_end_offset_, @@ -537,8 +576,8 @@ Status PlainTableReader::Next(uint32_t offset, ParsedInternalKey* key, if (value_ptr == nullptr) { return Status::Corruption("Error reading value length."); } - next_offset = offset + (value_ptr - row_ptr) + value_size; - if (next_offset > data_end_offset_) { + *next_offset = offset + (value_ptr - row_ptr) + value_size; + if (*next_offset > data_end_offset_) { return Status::Corruption("Reach end of file when reading value"); } *value = Slice(value_ptr, value_size); @@ -552,14 +591,28 @@ Status PlainTableReader::Get(const 
ReadOptions& ro, const Slice& target, const Slice&, bool), void (*mark_key_may_exist)(void*)) { // Check bloom filter first. - Slice prefix_slice = GetPrefix(target); - uint32_t prefix_hash = GetSliceHash(prefix_slice); - if (!MayHavePrefix(prefix_hash)) { - return Status::OK(); + Slice prefix_slice; + uint32_t prefix_hash; + if (IsTotalOrderMode()) { + // Match whole user key for bloom filter check. + if (!MatchBloom(GetSliceHash(GetUserKey(target)))) { + return Status::OK(); + } + // in total order mode, there is only one bucket 0, and we always use empty + // prefix. + prefix_slice = Slice(); + prefix_hash = 0; + } else { + prefix_slice = GetPrefix(target); + prefix_hash = GetSliceHash(prefix_slice); + if (!MatchBloom(prefix_hash)) { + return Status::OK(); + } } uint32_t offset; bool prefix_match; - Status s = GetOffset(target, prefix_slice, prefix_hash, prefix_match, offset); + Status s = + GetOffset(target, prefix_slice, prefix_hash, prefix_match, &offset); if (!s.ok()) { return s; } @@ -571,7 +624,7 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, Slice found_value; while (offset < data_end_offset_) { - Status s = Next(offset, &found_key, &found_value, offset); + Status s = Next(offset, &found_key, &found_value, &offset); if (!s.ok()) { return s; } @@ -596,8 +649,9 @@ uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& key) { return 0; } -PlainTableIterator::PlainTableIterator(PlainTableReader* table) : - table_(table) { +PlainTableIterator::PlainTableIterator(PlainTableReader* table, + bool use_prefix_seek) + : table_(table), use_prefix_seek_(use_prefix_seek) { next_offset_ = offset_ = table_->data_end_offset_; } @@ -620,18 +674,39 @@ void PlainTableIterator::SeekToFirst() { void PlainTableIterator::SeekToLast() { assert(false); + status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable"); } void PlainTableIterator::Seek(const Slice& target) { - Slice prefix_slice = table_->GetPrefix(target); - uint32_t 
prefix_hash = GetSliceHash(prefix_slice); - if (!table_->MayHavePrefix(prefix_hash)) { + // If the user doesn't set prefix seek option and we are not able to do a + // total Seek(). assert failure. + if (!use_prefix_seek_ && table_->hash_table_size_ > 1) { + assert(false); + status_ = Status::NotSupported( + "PlainTable cannot issue non-prefix seek unless in total order mode."); + offset_ = next_offset_ = table_->data_end_offset_; + return; + } + + Slice prefix_slice = table_->GetPrefix(target); + uint32_t prefix_hash; + uint32_t bloom_hash; + if (table_->IsTotalOrderMode()) { + // The total order mode, there is only one hash bucket 0. The bloom filter + // is checked against the whole user key. + prefix_hash = 0; + bloom_hash = GetSliceHash(table_->GetUserKey(target)); + } else { + prefix_hash = GetSliceHash(prefix_slice); + bloom_hash = prefix_hash; + } + if (!table_->MatchBloom(bloom_hash)) { offset_ = next_offset_ = table_->data_end_offset_; return; } bool prefix_match; status_ = table_->GetOffset(target, prefix_slice, prefix_hash, prefix_match, - next_offset_); + &next_offset_); if (!status_.ok()) { offset_ = next_offset_ = table_->data_end_offset_; return; @@ -661,7 +736,7 @@ void PlainTableIterator::Next() { if (offset_ < table_->data_end_offset_) { Slice tmp_slice; ParsedInternalKey parsed_key; - status_ = table_->Next(next_offset_, &parsed_key, &value_, next_offset_); + status_ = table_->Next(next_offset_, &parsed_key, &value_, &next_offset_); if (status_.ok()) { // Make a copy in this case. TODO optimize. 
tmp_str_.clear(); diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index dd7b1e50f..03bf11a4e 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -49,7 +49,8 @@ class PlainTableReader: public TableReader { const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table, - const int bloom_bits_per_key, double hash_table_ratio); + const int bloom_bits_per_key, double hash_table_ratio, + size_t index_sparseness); bool PrefixMayMatch(const Slice& internal_prefix); @@ -71,9 +72,58 @@ class PlainTableReader: public TableReader { PlainTableReader(const EnvOptions& storage_options, const InternalKeyComparator& internal_comparator, uint64_t file_size, int bloom_num_bits, - double hash_table_ratio, + double hash_table_ratio, size_t index_sparseness, const TableProperties* table_properties); - ~PlainTableReader(); + virtual ~PlainTableReader(); + + protected: + // Check bloom filter to see whether it might contain this prefix. + // The hash of the prefix is given, since it can be reused for index lookup + // too. + virtual bool MatchBloom(uint32_t hash) const; + + // PopulateIndex() builds index of keys. It must be called before any query + // to the table. + // + // hash_table_ contains buckets size of hash_table_size_, each is a 32-bit + // integer. The lower 31 bits contain an offset value (explained below) and + // the first bit of the integer indicates type of the offset. + // + // +--------------+------------------------------------------------------+ + // | Flag (1 bit) | Offset to binary search buffer or file (31 bits) + + // +--------------+------------------------------------------------------+ + // + // Explanation for the "flag bit": + // + // 0 indicates that the bucket contains only one prefix (no conflict when + // hashing this prefix), whose first row starts from this offset of the + // file. 
+ // 1 indicates that the bucket contains more than one prefixes, or there + // are too many rows for one prefix so we need a binary search for it. In + // this case, the offset indicates the offset of sub_index_ holding the + // binary search indexes of keys for those rows. Those binary search indexes + // are organized in this way: + // + // The first 4 bytes, indicate how many indexes (N) are stored after it. After + // it, there are N 32-bit integers, each points of an offset of the file, + // which + // points to starting of a row. Those offsets need to be guaranteed to be in + // ascending order so the keys they are pointing to are also in ascending + // order + // to make sure we can use them to do binary searches. Below is visual + // presentation of a bucket. + // + // + // number_of_records: varint32 + // record 1 file offset: fixedint32 + // record 2 file offset: fixedint32 + // .... + // record N file offset: fixedint32 + // + Status PopulateIndex(); + + Options options_; + unique_ptr file_; private: struct IndexRecord; @@ -83,11 +133,9 @@ class PlainTableReader: public TableReader { int hash_table_size_ = 0; char* sub_index_ = nullptr; - Options options_; const EnvOptions& soptions_; const InternalKeyComparator internal_comparator_; Status status_; - unique_ptr file_; Slice file_data_; uint32_t version_; @@ -95,6 +143,10 @@ class PlainTableReader: public TableReader { const double kHashTableRatio; const int kBloomBitsPerKey; + // To speed up the search for keys with same prefix, we'll add index key for + // every N keys, where the "N" is determined by + // kIndexIntervalForSamePrefixKeys + const size_t kIndexIntervalForSamePrefixKeys = 16; DynamicBloom* bloom_ = nullptr; std::shared_ptr table_properties_; @@ -107,10 +159,6 @@ class PlainTableReader: public TableReader { static const size_t kOffsetLen = sizeof(uint32_t); static const uint64_t kMaxFileSize = 1u << 31; static const size_t kRecordsPerGroup = 256; - // To speed up the search for keys with 
same prefix, we'll add index key for - // every N keys, where the "N" is determined by - // kIndexIntervalForSamePrefixKeys - static const size_t kIndexIntervalForSamePrefixKeys = 16; bool IsFixedLength() const { return user_key_len_ != kPlainTableVariableLength; @@ -125,7 +173,10 @@ class PlainTableReader: public TableReader { // Internal helper function to generate an IndexRecordList object from all // the rows, which contains index records as a list. - int PopulateIndexRecordList(IndexRecordList* record_list); + // If bloom_ is not null, all the keys' full-key hash will be added to the + // bloom filter. + Status PopulateIndexRecordList(IndexRecordList* record_list, + int* num_prefixes, DynamicBloom* bloom_) const; // Internal helper function to allocate memory for indexes and bloom filters void AllocateIndexAndBloom(int num_prefixes); @@ -148,72 +199,45 @@ class PlainTableReader: public TableReader { const std::vector& hash_to_offsets, const std::vector& bucket_count); - // PopulateIndex() builds index of keys. It must be called before any query - // to the table. - // - // hash_table_ contains buckets size of hash_table_size_, each is a 32-bit - // integer. The lower 31 bits contain an offset value (explained below) and - // the first bit of the integer indicates type of the offset. - // - // +--------------+------------------------------------------------------+ - // | Flag (1 bit) | Offset to binary search buffer or file (31 bits) + - // +--------------+------------------------------------------------------+ - // - // Explanation for the "flag bit": - // - // 0 indicates that the bucket contains only one prefix (no conflict when - // hashing this prefix), whose first row starts from this offset of the - // file. - // 1 indicates that the bucket contains more than one prefixes, or there - // are too many rows for one prefix so we need a binary search for it. 
In - // this case, the offset indicates the offset of sub_index_ holding the - // binary search indexes of keys for those rows. Those binary search indexes - // are organized in this way: - // - // The first 4 bytes, indicate how many indexes (N) are stored after it. After - // it, there are N 32-bit integers, each points of an offset of the file, - // which - // points to starting of a row. Those offsets need to be guaranteed to be in - // ascending order so the keys they are pointing to are also in ascending - // order - // to make sure we can use them to do binary searches. Below is visual - // presentation of a bucket. - // - // - // number_of_records: varint32 - // record 1 file offset: fixedint32 - // record 2 file offset: fixedint32 - // .... - // record N file offset: fixedint32 - // - Status PopulateIndex(); - - // Check bloom filter to see whether it might contain this prefix. - // The hash of the prefix is given, since it can be reused for index lookup - // too. - bool MayHavePrefix(uint32_t hash); - Status ReadKey(const char* row_ptr, ParsedInternalKey* key, - size_t& bytes_read); + size_t* bytes_read) const; // Read the key and value at offset to key and value. // tmp_slice is a tmp slice. // return next_offset as the offset for the next key. Status Next(uint32_t offset, ParsedInternalKey* key, Slice* value, - uint32_t& next_offset); + uint32_t* next_offset) const; // Get file offset for key target. // return value prefix_matched is set to true if the offset is confirmed // for a key with the same prefix as target. 
Status GetOffset(const Slice& target, const Slice& prefix, uint32_t prefix_hash, bool& prefix_matched, - uint32_t& ret_offset); + uint32_t* ret_offset) const; + + Slice GetUserKey(const Slice& key) const { + return Slice(key.data(), key.size() - 8); + } - Slice GetPrefix(const Slice& target) { + Slice GetPrefix(const Slice& target) const { assert(target.size() >= 8); // target is internal key - return options_.prefix_extractor->Transform( - Slice(target.data(), target.size() - 8)); + return GetPrefixFromUserKey(GetUserKey(target)); } - Slice GetPrefix(const ParsedInternalKey& target); + inline Slice GetPrefix(const ParsedInternalKey& target) const; + + Slice GetPrefixFromUserKey(const Slice& user_key) const { + if (!IsTotalOrderMode()) { + return options_.prefix_extractor->Transform(user_key); + } else { + // Use empty slice as prefix if prefix_extractor is not set. In that case, + // it falls back to pure binary search and total iterator seek is + // supported. + return Slice(); + } + } + + bool IsTotalOrderMode() const { + return (options_.prefix_extractor == nullptr); + } // No copying allowed explicit PlainTableReader(const TableReader&) = delete; diff --git a/table/table_test.cc b/table/table_test.cc index 836008564..bef5caac1 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -306,8 +306,11 @@ class KeyConvertingIterator: public Iterator { class TableConstructor: public Constructor { public: explicit TableConstructor(const Comparator* cmp, - bool convert_to_internal_key = false) - : Constructor(cmp), convert_to_internal_key_(convert_to_internal_key) {} + bool convert_to_internal_key = false, + bool prefix_seek = false) + : Constructor(cmp), + convert_to_internal_key_(convert_to_internal_key), + prefix_seek_(prefix_seek) {} ~TableConstructor() { Reset(); } virtual Status FinishImpl(const Options& options, @@ -347,7 +350,11 @@ class TableConstructor: public Constructor { } virtual Iterator* NewIterator() const { - Iterator* iter = 
table_reader_->NewIterator(ReadOptions()); + ReadOptions ro; + if (prefix_seek_) { + ro.prefix_seek = true; + } + Iterator* iter = table_reader_->NewIterator(ro); if (convert_to_internal_key_) { return new KeyConvertingIterator(iter); } else { @@ -380,6 +387,7 @@ class TableConstructor: public Constructor { source_.reset(); } bool convert_to_internal_key_; + bool prefix_seek_; uint64_t uniq_id_; unique_ptr sink_; @@ -548,6 +556,7 @@ enum TestType { BLOCK_BASED_TABLE_TEST, PLAIN_TABLE_SEMI_FIXED_PREFIX, PLAIN_TABLE_FULL_STR_PREFIX, + PLAIN_TABLE_TOTAL_ORDER, BLOCK_TEST, MEMTABLE_TEST, DB_TEST @@ -564,8 +573,9 @@ static std::vector GenerateArgList() { std::vector test_args; std::vector test_types = { BLOCK_BASED_TABLE_TEST, PLAIN_TABLE_SEMI_FIXED_PREFIX, - PLAIN_TABLE_FULL_STR_PREFIX, BLOCK_TEST, - MEMTABLE_TEST, DB_TEST}; + PLAIN_TABLE_FULL_STR_PREFIX, PLAIN_TABLE_TOTAL_ORDER, + BLOCK_TEST, MEMTABLE_TEST, + DB_TEST}; std::vector reverse_compare_types = {false, true}; std::vector restart_intervals = {16, 1, 1024}; @@ -688,8 +698,8 @@ class Harness { only_support_prefix_seek_ = true; options_.prefix_extractor = prefix_transform.get(); options_.allow_mmap_reads = true; - options_.table_factory.reset(new PlainTableFactory()); - constructor_ = new TableConstructor(options_.comparator, true); + options_.table_factory.reset(NewPlainTableFactory()); + constructor_ = new TableConstructor(options_.comparator, true, true); internal_comparator_.reset( new InternalKeyComparator(options_.comparator)); break; @@ -698,8 +708,18 @@ class Harness { only_support_prefix_seek_ = true; options_.prefix_extractor = noop_transform.get(); options_.allow_mmap_reads = true; - options_.table_factory.reset(new PlainTableFactory()); - constructor_ = new TableConstructor(options_.comparator, true); + options_.table_factory.reset(NewPlainTableFactory()); + constructor_ = new TableConstructor(options_.comparator, true, true); + internal_comparator_.reset( + new 
InternalKeyComparator(options_.comparator)); + break; + case PLAIN_TABLE_TOTAL_ORDER: + support_prev_ = false; + only_support_prefix_seek_ = false; + options_.prefix_extractor = nullptr; + options_.allow_mmap_reads = true; + options_.table_factory.reset(NewTotalOrderPlainTableFactory()); + constructor_ = new TableConstructor(options_.comparator, true, false); internal_comparator_.reset( new InternalKeyComparator(options_.comparator)); break;