diff --git a/.arcconfig b/.arcconfig index 0c4c85e69..85ca38f25 100644 --- a/.arcconfig +++ b/.arcconfig @@ -6,6 +6,5 @@ "linters" ], "lint.engine" : "FacebookFbcodeLintEngine", - "lint.engine.single.linter" : "FbcodeCppLinter", - "lint.cpplint.prefix" : "linters" + "lint.engine.single.linter" : "FbcodeCppLinter" } diff --git a/.gitignore b/.gitignore index 48c56f432..a4cddf141 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ build_config.mk *_test *_bench *_stress +*.out ldb manifest_dump diff --git a/Makefile b/Makefile index 3b59a4ee5..04d8466a5 100644 --- a/Makefile +++ b/Makefile @@ -97,7 +97,7 @@ TOOLS = \ blob_store_bench -PROGRAMS = db_bench signal_test $(TESTS) $(TOOLS) +PROGRAMS = db_bench signal_test table_reader_bench $(TESTS) $(TOOLS) BENCHMARKS = db_bench_sqlite3 db_bench_tree_db table_reader_bench # The library name is configurable since we are maintaining libraries of both diff --git a/db/db_impl.cc b/db/db_impl.cc index 3c3b60e13..53fdbef18 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3604,6 +3604,23 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) { return s; } +Status DBImpl::GetPropertiesOfAllTables(TablePropertiesCollection* props) { + // Increment the ref count + mutex_.Lock(); + auto version = versions_->current(); + version->Ref(); + mutex_.Unlock(); + + auto s = version->GetPropertiesOfAllTables(props); + + // Decrement the ref count + mutex_.Lock(); + version->Unref(); + mutex_.Unlock(); + + return s; +} + const std::string& DBImpl::GetName() const { return dbname_; } diff --git a/db/db_impl.h b/db/db_impl.h index ffffdce1c..5292e90cc 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -494,6 +494,9 @@ class DBImpl : public DB { void InstallSuperVersion(ColumnFamilyData* cfd, DeletionState& deletion_state); + virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) + override; + // Function that Get and KeyMayExist call with no_io true or false // Note: 'value_found' from KeyMayExist 
propagates here Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, diff --git a/db/db_test.cc b/db/db_test.cc index 312fbfdc8..ee1c4651a 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -10,6 +10,7 @@ #include #include #include +#include #include "db/dbformat.h" #include "db/db_impl.h" @@ -26,6 +27,7 @@ #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" +#include "rocksdb/table_properties.h" #include "table/block_based_table_factory.h" #include "util/hash.h" #include "util/hash_linklist_rep.h" @@ -953,6 +955,28 @@ static long TestGetTickerCount(const Options& options, Tickers ticker_type) { return options.statistics->getTickerCount(ticker_type); } +// A helper function that ensures the table properties returned in +// `GetPropertiesOfAllTablesTest` is correct. +// This test assumes entries size is differnt for each of the tables. +void VerifyTableProperties(DB* db, uint64_t expected_entries_size) { + TablePropertiesCollection props; + ASSERT_OK(db->GetPropertiesOfAllTables(&props)); + + assert(props.size() == 4); + ASSERT_EQ(4U, props.size()); + std::unordered_set unique_entries; + + // Indirect test + uint64_t sum = 0; + for (const auto& item : props) { + unique_entries.insert(item.second->num_entries); + sum += item.second->num_entries; + } + + ASSERT_EQ(props.size(), unique_entries.size()); + ASSERT_EQ(expected_entries_size, sum); +} + TEST(DBTest, Empty) { do { Options options = CurrentOptions(); @@ -1041,6 +1065,41 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); } +TEST(DBTest, GetPropertiesOfAllTablesTest) { + Options options = CurrentOptions(); + Reopen(&options); + // Create 4 tables + for (int table = 0; table < 4; ++table) { + for (int i = 0; i < 10 + table; ++i) { + db_->Put(WriteOptions(), std::to_string(table * 100 + i), "val"); + } + db_->Flush(FlushOptions()); + } + + // 1. 
Read table properties directly from file + Reopen(&options); + VerifyTableProperties(db_, 10 + 11 + 12 + 13); + + // 2. Put two tables to table cache and + Reopen(&options); + // fetch key from 1st and 2nd table, which will internally place that table to + // the table cache. + for (int i = 0; i < 2; ++i) { + Get(std::to_string(i * 100 + 0)); + } + + VerifyTableProperties(db_, 10 + 11 + 12 + 13); + + // 3. Put all tables to table cache + Reopen(&options); + // fetch key from 1st and 2nd table, which will internally place that table to + // the table cache. + for (int i = 0; i < 4; ++i) { + Get(std::to_string(i * 100 + 0)); + } + VerifyTableProperties(db_, 10 + 11 + 12 + 13); +} + TEST(DBTest, LevelLimitReopen) { Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu"}, &options); @@ -5073,6 +5132,11 @@ class ModelDB: public DB { Status::NotSupported("Not implemented.")); return s; } + + virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) { + return Status(); + } + using DB::KeyMayExist; virtual bool KeyMayExist(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 0d554278c..3ad7ce8d7 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -22,7 +22,9 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" +#include "table/meta_blocks.h" #include "table/plain_table_factory.h" +#include "table/plain_table_reader.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" @@ -46,6 +48,7 @@ class PlainTableDBTest { public: PlainTableDBTest() : env_(Env::Default()) { + ro_.prefix_seek = true; dbname_ = test::TmpDir() + "/plain_table_db_test"; ASSERT_OK(DestroyDB(dbname_, Options())); db_ = nullptr; @@ -57,10 +60,12 @@ class PlainTableDBTest { ASSERT_OK(DestroyDB(dbname_, Options())); } + ReadOptions ro_; + // Return the current option configuration. 
Options CurrentOptions() { Options options; - options.table_factory.reset(new PlainTableFactory(16, 2, 0.8)); + options.table_factory.reset(NewPlainTableFactory(16, 2, 0.8, 3)); options.prefix_extractor = prefix_transform.get(); options.allow_mmap_reads = true; return options; @@ -119,7 +124,7 @@ class PlainTableDBTest { } std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { - ReadOptions options; + ReadOptions options = ro_; options.snapshot = snapshot; std::string result; Status s = db_->Get(options, k, &result); @@ -176,25 +181,296 @@ TEST(PlainTableDBTest, Empty) { ASSERT_EQ("NOT_FOUND", Get("0000000000000foo")); } -TEST(PlainTableDBTest, ReadWrite) { - ASSERT_OK(Put("1000000000000foo", "v1")); - ASSERT_EQ("v1", Get("1000000000000foo")); - ASSERT_OK(Put("0000000000000bar", "v2")); - ASSERT_OK(Put("1000000000000foo", "v3")); - ASSERT_EQ("v3", Get("1000000000000foo")); - ASSERT_EQ("v2", Get("0000000000000bar")); -} +class TestPlainTableReader : public PlainTableReader { + public: + TestPlainTableReader(const EnvOptions& storage_options, + const InternalKeyComparator& icomparator, + uint64_t file_size, int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness, + const TableProperties* table_properties, + unique_ptr&& file, + const Options& options, bool* expect_bloom_not_match) + : PlainTableReader(options, std::move(file), storage_options, icomparator, + file_size, bloom_bits_per_key, hash_table_ratio, + index_sparseness, table_properties), + expect_bloom_not_match_(expect_bloom_not_match) { + Status s = PopulateIndex(); + ASSERT_TRUE(s.ok()); + } + + virtual ~TestPlainTableReader() {} + + private: + virtual bool MatchBloom(uint32_t hash) const override { + bool ret = PlainTableReader::MatchBloom(hash); + ASSERT_TRUE(!*expect_bloom_not_match_ || !ret); + return ret; + } + bool* expect_bloom_not_match_; +}; + +extern const uint64_t kPlainTableMagicNumber; +class TestPlainTableFactory : public PlainTableFactory { + public: + 
explicit TestPlainTableFactory(bool* expect_bloom_not_match, + uint32_t user_key_len = + kPlainTableVariableLength, + int bloom_bits_per_key = 0, + double hash_table_ratio = 0.75, + size_t index_sparseness = 16) + : PlainTableFactory(user_key_len, bloom_bits_per_key, hash_table_ratio, + index_sparseness), + user_key_len_(user_key_len), + bloom_bits_per_key_(bloom_bits_per_key), + hash_table_ratio_(hash_table_ratio), + index_sparseness_(index_sparseness), + expect_bloom_not_match_(expect_bloom_not_match) {} + + Status NewTableReader(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table) const override { + TableProperties* props = nullptr; + auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, + options.env, options.info_log.get(), &props); + ASSERT_TRUE(s.ok()); + + std::unique_ptr new_reader(new TestPlainTableReader( + soptions, internal_comparator, file_size, bloom_bits_per_key_, + hash_table_ratio_, index_sparseness_, props, std::move(file), options, + expect_bloom_not_match_)); + + *table = std::move(new_reader); + return s; + } + + private: + uint32_t user_key_len_; + int bloom_bits_per_key_; + double hash_table_ratio_; + size_t index_sparseness_; + bool* expect_bloom_not_match_; +}; TEST(PlainTableDBTest, Flush) { - ASSERT_OK(Put("1000000000000foo", "v1")); - ASSERT_OK(Put("0000000000000bar", "v2")); - ASSERT_OK(Put("1000000000000foo", "v3")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v3", Get("1000000000000foo")); - ASSERT_EQ("v2", Get("0000000000000bar")); + for (int bloom_bits = 0; bloom_bits <= 8; bloom_bits += 8) { + for (int total_order = 0; total_order <= 1; total_order++) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. 
+ // Test index interval for the same prefix to be 1, 2 and 4 + if (total_order) { + options.table_factory.reset( + NewTotalOrderPlainTableFactory(16, bloom_bits, 2)); + } else { + options.table_factory.reset(NewPlainTableFactory(16, bloom_bits)); + } + DestroyAndReopen(&options); + + ASSERT_OK(Put("1000000000000foo", "v1")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("1000000000000foo", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("1000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); + } + } +} + +TEST(PlainTableDBTest, Flush2) { + for (int bloom_bits = 0; bloom_bits <= 10; bloom_bits += 10) { + for (int total_order = 0; total_order <= 1; total_order++) { + bool expect_bloom_not_match = false; + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + if (total_order) { + options.prefix_extractor = nullptr; + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, 16, bloom_bits, 0, 2)); + } else { + options.table_factory.reset( + new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits)); + } + DestroyAndReopen(&options); + ASSERT_OK(Put("0000000000000bar", "b")); + ASSERT_OK(Put("1000000000000foo", "v1")); + dbfull()->TEST_FlushMemTable(); + + ASSERT_OK(Put("1000000000000foo", "v2")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get("1000000000000foo")); + + ASSERT_OK(Put("0000000000000eee", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("0000000000000eee")); + + ASSERT_OK(Delete("0000000000000bar")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); + + ASSERT_OK(Put("0000000000000eee", "v5")); + ASSERT_OK(Put("9000000000000eee", "v5")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v5", Get("0000000000000eee")); + + // Test Bloom Filter + if (bloom_bits > 0) { + // Neither key nor value 
should exist. + expect_bloom_not_match = true; + ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar")); + + // Key doesn't exist any more but prefix exists. + if (total_order) { + ASSERT_EQ("NOT_FOUND", Get("1000000000000not")); + ASSERT_EQ("NOT_FOUND", Get("0000000000000not")); + } + expect_bloom_not_match = false; + } + } + } } TEST(PlainTableDBTest, Iterator) { + for (int bloom_bits = 0; bloom_bits <= 8; bloom_bits += 8) { + for (int total_order = 0; total_order <= 1; total_order++) { + bool expect_bloom_not_match = false; + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + if (total_order) { + options.prefix_extractor = nullptr; + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, 16, bloom_bits, 0, 2)); + } else { + options.table_factory.reset( + new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits)); + } + DestroyAndReopen(&options); + + ASSERT_OK(Put("1000000000foo002", "v_2")); + ASSERT_OK(Put("0000000000000bar", "random")); + ASSERT_OK(Put("1000000000foo001", "v1")); + ASSERT_OK(Put("3000000000000bar", "bar_v")); + ASSERT_OK(Put("1000000000foo003", "v__3")); + ASSERT_OK(Put("1000000000foo004", "v__4")); + ASSERT_OK(Put("1000000000foo005", "v__5")); + ASSERT_OK(Put("1000000000foo007", "v__7")); + ASSERT_OK(Put("1000000000foo008", "v__8")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v1", Get("1000000000foo001")); + ASSERT_EQ("v__3", Get("1000000000foo003")); + Iterator* iter = dbfull()->NewIterator(ro_); + iter->Seek("1000000000foo000"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo001", iter->key().ToString()); + ASSERT_EQ("v1", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo002", iter->key().ToString()); + ASSERT_EQ("v_2", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + 
ASSERT_EQ("1000000000foo003", iter->key().ToString()); + ASSERT_EQ("v__3", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo004", iter->key().ToString()); + ASSERT_EQ("v__4", iter->value().ToString()); + + iter->Seek("3000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + ASSERT_EQ("bar_v", iter->value().ToString()); + + iter->Seek("1000000000foo000"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo001", iter->key().ToString()); + ASSERT_EQ("v1", iter->value().ToString()); + + iter->Seek("1000000000foo005"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo005", iter->key().ToString()); + ASSERT_EQ("v__5", iter->value().ToString()); + + iter->Seek("1000000000foo006"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo007", iter->key().ToString()); + ASSERT_EQ("v__7", iter->value().ToString()); + + iter->Seek("1000000000foo008"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo008", iter->key().ToString()); + ASSERT_EQ("v__8", iter->value().ToString()); + + if (total_order == 0) { + iter->Seek("1000000000foo009"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + } + + // Test Bloom Filter + if (bloom_bits > 0) { + // Neither key nor value should exist. + expect_bloom_not_match = true; + iter->Seek("2not000000000bar"); + ASSERT_TRUE(!iter->Valid()); + + // Key doesn't exist any more but prefix exists. + if (total_order) { + iter->Seek("2not000000000bar"); + ASSERT_TRUE(!iter->Valid()); + } + expect_bloom_not_match = false; + } + + delete iter; + } + } +} + +// A test comparator which compare two strings in this way: +// (1) first compare prefix of 8 bytes in alphabet order, +// (2) if two strings share the same prefix, sort the other part of the string +// in the reverse alphabet order. 
+class SimpleSuffixReverseComparator : public Comparator { + public: + SimpleSuffixReverseComparator() {} + + virtual const char* Name() const { return "SimpleSuffixReverseComparator"; } + + virtual int Compare(const Slice& a, const Slice& b) const { + Slice prefix_a = Slice(a.data(), 8); + Slice prefix_b = Slice(b.data(), 8); + int prefix_comp = prefix_a.compare(prefix_b); + if (prefix_comp != 0) { + return prefix_comp; + } else { + Slice suffix_a = Slice(a.data() + 8, a.size() - 8); + Slice suffix_b = Slice(b.data() + 8, b.size() - 8); + return -(suffix_a.compare(suffix_b)); + } + } + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const {} + + virtual void FindShortSuccessor(std::string* key) const {} +}; + +TEST(PlainTableDBTest, IteratorReverseSuffixComparator) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + SimpleSuffixReverseComparator comp; + options.comparator = &comp; + DestroyAndReopen(&options); + ASSERT_OK(Put("1000000000foo002", "v_2")); ASSERT_OK(Put("0000000000000bar", "random")); ASSERT_OK(Put("1000000000foo001", "v1")); @@ -207,22 +483,21 @@ TEST(PlainTableDBTest, Iterator) { dbfull()->TEST_FlushMemTable(); ASSERT_EQ("v1", Get("1000000000foo001")); ASSERT_EQ("v__3", Get("1000000000foo003")); - ReadOptions ro; - Iterator* iter = dbfull()->NewIterator(ro); - iter->Seek("1000000000foo001"); + Iterator* iter = dbfull()->NewIterator(ro_); + iter->Seek("1000000000foo009"); ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo001", iter->key().ToString()); - ASSERT_EQ("v1", iter->value().ToString()); + ASSERT_EQ("1000000000foo008", iter->key().ToString()); + ASSERT_EQ("v__8", iter->value().ToString()); iter->Next(); ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo002", iter->key().ToString()); - ASSERT_EQ("v_2", iter->value().ToString()); + ASSERT_EQ("1000000000foo007", 
iter->key().ToString()); + ASSERT_EQ("v__7", iter->value().ToString()); iter->Next(); ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo003", iter->key().ToString()); - ASSERT_EQ("v__3", iter->value().ToString()); + ASSERT_EQ("1000000000foo005", iter->key().ToString()); + ASSERT_EQ("v__5", iter->value().ToString()); iter->Next(); ASSERT_TRUE(iter->Valid()); @@ -234,11 +509,6 @@ TEST(PlainTableDBTest, Iterator) { ASSERT_EQ("3000000000000bar", iter->key().ToString()); ASSERT_EQ("bar_v", iter->value().ToString()); - iter->Seek("1000000000foo000"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo001", iter->key().ToString()); - ASSERT_EQ("v1", iter->value().ToString()); - iter->Seek("1000000000foo005"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("1000000000foo005", iter->key().ToString()); @@ -246,42 +516,220 @@ TEST(PlainTableDBTest, Iterator) { iter->Seek("1000000000foo006"); ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo007", iter->key().ToString()); - ASSERT_EQ("v__7", iter->value().ToString()); + ASSERT_EQ("1000000000foo005", iter->key().ToString()); + ASSERT_EQ("v__5", iter->value().ToString()); iter->Seek("1000000000foo008"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("1000000000foo008", iter->key().ToString()); ASSERT_EQ("v__8", iter->value().ToString()); - iter->Seek("1000000000foo009"); + iter->Seek("1000000000foo000"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("3000000000000bar", iter->key().ToString()); - delete iter; } -TEST(PlainTableDBTest, Flush2) { - ASSERT_OK(Put("0000000000000bar", "b")); - ASSERT_OK(Put("1000000000000foo", "v1")); - dbfull()->TEST_FlushMemTable(); +TEST(PlainTableDBTest, HashBucketConflict) { + for (unsigned char i = 1; i <= 3; i++) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. 
+ // Test index interval for the same prefix to be 1, 2 and 4 + options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 1 << (i - 1))); + DestroyAndReopen(&options); + ASSERT_OK(Put("5000000000000fo0", "v1")); + ASSERT_OK(Put("5000000000000fo1", "v2")); + ASSERT_OK(Put("5000000000000fo2", "v")); + ASSERT_OK(Put("2000000000000fo0", "v3")); + ASSERT_OK(Put("2000000000000fo1", "v4")); + ASSERT_OK(Put("2000000000000fo2", "v")); + ASSERT_OK(Put("2000000000000fo3", "v")); + + dbfull()->TEST_FlushMemTable(); + + ASSERT_EQ("v1", Get("5000000000000fo0")); + ASSERT_EQ("v2", Get("5000000000000fo1")); + ASSERT_EQ("v3", Get("2000000000000fo0")); + ASSERT_EQ("v4", Get("2000000000000fo1")); + + ASSERT_EQ("NOT_FOUND", Get("5000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("2000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8")); + ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8")); + + ReadOptions ro; + Iterator* iter = dbfull()->NewIterator(ro); + + iter->Seek("5000000000000fo0"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo0", iter->key().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo1", iter->key().ToString()); + + iter->Seek("5000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo1", iter->key().ToString()); + + iter->Seek("2000000000000fo0"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo0", iter->key().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo1", iter->key().ToString()); + + iter->Seek("2000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo1", iter->key().ToString()); + + iter->Seek("2000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo0", iter->key().ToString()); + + iter->Seek("5000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo0", iter->key().ToString()); + + iter->Seek("2000000000000fo8"); + ASSERT_TRUE(!iter->Valid() || + 
options.comparator->Compare(iter->key(), "20000001") > 0); + + iter->Seek("5000000000000fo8"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("1000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("3000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("8000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + delete iter; + } +} - ASSERT_OK(Put("1000000000000foo", "v2")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v2", Get("1000000000000foo")); +TEST(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) { + for (unsigned char i = 1; i <= 3; i++) { + Options options = CurrentOptions(); + options.create_if_missing = true; + SimpleSuffixReverseComparator comp; + options.comparator = &comp; + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 1 << (i - 1))); + DestroyAndReopen(&options); + ASSERT_OK(Put("5000000000000fo0", "v1")); + ASSERT_OK(Put("5000000000000fo1", "v2")); + ASSERT_OK(Put("5000000000000fo2", "v")); + ASSERT_OK(Put("2000000000000fo0", "v3")); + ASSERT_OK(Put("2000000000000fo1", "v4")); + ASSERT_OK(Put("2000000000000fo2", "v")); + ASSERT_OK(Put("2000000000000fo3", "v")); + + dbfull()->TEST_FlushMemTable(); + + ASSERT_EQ("v1", Get("5000000000000fo0")); + ASSERT_EQ("v2", Get("5000000000000fo1")); + ASSERT_EQ("v3", Get("2000000000000fo0")); + ASSERT_EQ("v4", Get("2000000000000fo1")); + + ASSERT_EQ("NOT_FOUND", Get("5000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("2000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8")); + ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8")); + + ReadOptions ro; + Iterator* iter = dbfull()->NewIterator(ro); + + iter->Seek("5000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo1", iter->key().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo0", iter->key().ToString()); + + iter->Seek("5000000000000fo1"); + 
ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo1", iter->key().ToString()); + + iter->Seek("2000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo1", iter->key().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo0", iter->key().ToString()); + + iter->Seek("2000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo1", iter->key().ToString()); + + iter->Seek("2000000000000var"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo3", iter->key().ToString()); + + iter->Seek("5000000000000var"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo2", iter->key().ToString()); + + std::string seek_key = "2000000000000bar"; + iter->Seek(seek_key); + ASSERT_TRUE(!iter->Valid() || + options.prefix_extractor->Transform(iter->key()) != + options.prefix_extractor->Transform(seek_key)); + + iter->Seek("1000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("3000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("8000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + delete iter; + } +} - ASSERT_OK(Put("0000000000000eee", "v3")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v3", Get("0000000000000eee")); +TEST(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. 
+ // Test index interval for the same prefix to be 1, 2 and 4 + options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 5)); + DestroyAndReopen(&options); + ASSERT_OK(Put("5000000000000fo0", "v1")); + ASSERT_OK(Put("5000000000000fo1", "v2")); + ASSERT_OK(Put("5000000000000fo2", "v3")); - ASSERT_OK(Delete("0000000000000bar")); dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); - ASSERT_OK(Put("0000000000000eee", "v5")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v5", Get("0000000000000eee")); + ASSERT_EQ("v1", Get("5000000000000fo0")); + ASSERT_EQ("v2", Get("5000000000000fo1")); + ASSERT_EQ("v3", Get("5000000000000fo2")); + + ASSERT_EQ("NOT_FOUND", Get("8000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("1000000000000bar")); + + Iterator* iter = dbfull()->NewIterator(ro_); + + iter->Seek("5000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo0", iter->key().ToString()); + + iter->Seek("5000000000000fo8"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("1000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("8000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + delete iter; } static std::string Key(int i) { diff --git a/db/table_cache.cc b/db/table_cache.cc index 3301b98d9..c03ab5e1a 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -158,6 +158,32 @@ Status TableCache::Get(const ReadOptions& options, } return s; } +Status TableCache::GetTableProperties( + const EnvOptions& toptions, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + std::shared_ptr* properties, bool no_io) { + Status s; + auto table_handle = file_meta.table_reader_handle; + // table already been pre-loaded? 
+ if (table_handle) { + auto table = GetTableReaderFromHandle(table_handle); + *properties = table->GetTableProperties(); + return s; + } + + bool table_io; + s = FindTable(toptions, internal_comparator, file_meta.number, + file_meta.file_size, &table_handle, &table_io, no_io); + if (!s.ok()) { + return s; + } + assert(table_handle); + auto table = GetTableReaderFromHandle(table_handle); + *properties = table->GetTableProperties(); + ReleaseHandle(table_handle); + return s; +} bool TableCache::PrefixMayMatch(const ReadOptions& options, const InternalKeyComparator& icomparator, diff --git a/db/table_cache.h b/db/table_cache.h index 44f47e353..865c4bec2 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -73,6 +73,18 @@ class TableCache { // Get TableReader from a cache handle. TableReader* GetTableReaderFromHandle(Cache::Handle* handle); + // Get the table properties of a given table. + // @no_io: indicates if we should load table to the cache if it is not present + // in table cache yet. + // @returns: `properties` will be reset on success. Please note that we will + // return Status::Incomplete() if table is not present in cache and + // we set `no_io` to be true. 
+ Status GetTableProperties(const EnvOptions& toptions, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + std::shared_ptr* properties, + bool no_io = false); + // Release the handle from a cache void ReleaseHandle(Cache::Handle* handle); diff --git a/db/version_set.cc b/db/version_set.cc index 972b887f5..55ba9fead 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -27,6 +27,8 @@ #include "table/table_reader.h" #include "table/merger.h" #include "table/two_level_iterator.h" +#include "table/format.h" +#include "table/meta_blocks.h" #include "util/coding.h" #include "util/logging.h" #include "util/stop_watch.h" @@ -241,6 +243,59 @@ bool Version::PrefixMayMatch(const ReadOptions& options, return may_match; } +Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { + auto table_cache = vset_->table_cache_; + auto options = vset_->options_; + for (int level = 0; level < num_levels_; level++) { + for (const auto& file_meta : files_[level]) { + auto fname = TableFileName(vset_->dbname_, file_meta->number); + // 1. If the table is already present in table cache, load table + // properties from there. + std::shared_ptr table_properties; + Status s = table_cache->GetTableProperties( + vset_->storage_options_, vset_->icmp_, *file_meta, &table_properties, + true /* no io */); + if (s.ok()) { + props->insert({fname, table_properties}); + continue; + } + + // We only ignore error type `Incomplete` since it's by design that we + // disallow table when it's not in table cache. + if (!s.IsIncomplete()) { + return s; + } + + // 2. Table is not present in table cache, we'll read the table properties + // directly from the properties block in the file. 
+ std::unique_ptr file; + s = vset_->env_->NewRandomAccessFile(fname, &file, + vset_->storage_options_); + if (!s.ok()) { + return s; + } + + TableProperties* raw_table_properties = nullptr; + // By setting the magic number to kInvalidTableMagicNumber, we can + // bypass the magic number check in the footer. + s = ReadTableProperties( + file.get(), file_meta->file_size, + Footer::kInvalidTableMagicNumber /* table's magic number */, + vset_->env_, options->info_log.get(), &raw_table_properties); + if (!s.ok()) { + return s; + } + RecordTick(options->statistics.get(), + NUMBER_DIRECT_LOAD_TABLE_PROPERTIES); + + props->insert({fname, std::shared_ptr( + raw_table_properties)}); + } + } + + return Status::OK(); +} + Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, const EnvOptions& soptions, int level) const { diff --git a/db/version_set.h b/db/version_set.h index 49e47b2d3..10a3a50bb 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -193,6 +193,12 @@ class Version { // Returns the version nuber of this version uint64_t GetVersionNumber() const { return version_number_; } + // REQUIRES: lock is held + // On success, *props will be populated with all SSTables' table properties. + // The keys of `props` are the sst file name, the values of `props` are the + // tables' properties, represented as shared_ptr. 
+ Status GetPropertiesOfAllTables(TablePropertiesCollection* props); + // used to sort files by size struct Fsize { int index; diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index c800eac27..a98172eac 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -14,6 +14,7 @@ #include #include #include +#include #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/types.h" @@ -47,6 +48,7 @@ struct Options; struct ReadOptions; struct WriteOptions; struct FlushOptions; +struct TableProperties; class WriteBatch; class Env; @@ -78,6 +80,12 @@ struct Range { Range(const Slice& s, const Slice& l) : start(s), limit(l) { } }; +// A collections of table properties objects, where +// key: is the table's file name. +// value: the table properties object of the given table. +typedef std::unordered_map> + TablePropertiesCollection; + // A DB is a persistent ordered map from keys to values. // A DB is safe for concurrent access from multiple threads without // any external synchronization. @@ -427,6 +435,8 @@ class DB { // Returns default column family handle virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0; + virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) = 0; + private: // No copying allowed DB(const DB&); diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index cddd74bf8..24384e9ce 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -57,9 +57,9 @@ enum Tickers { * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction * There are 3 reasons currently. */ - COMPACTION_KEY_DROP_NEWER_ENTRY, // key was written with a newer value. - COMPACTION_KEY_DROP_OBSOLETE, // The key is obsolete. - COMPACTION_KEY_DROP_USER, // user compaction function has dropped the key. + COMPACTION_KEY_DROP_NEWER_ENTRY, // key was written with a newer value. + COMPACTION_KEY_DROP_OBSOLETE, // The key is obsolete. 
+ COMPACTION_KEY_DROP_USER, // user compaction function has dropped the key. // Number of keys written to the database via the Put and Write call's NUMBER_KEYS_WRITTEN, @@ -80,8 +80,7 @@ enum Tickers { // write throttle because of too many files in L0 STALL_L0_NUM_FILES_MICROS, RATE_LIMIT_DELAY_MILLIS, - - NO_ITERATORS, // number of iterators currently open + NO_ITERATORS, // number of iterators currently open // Number of MultiGet calls, keys read, and bytes read NUMBER_MULTIGET_CALLS, @@ -107,77 +106,77 @@ enum Tickers { // Record the number of calls to GetUpadtesSince. Useful to keep track of // transaction log iterator refreshes GET_UPDATES_SINCE_CALLS, - - BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache - BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache - - WAL_FILE_SYNCED, // Number of times WAL sync is done - WAL_FILE_BYTES, // Number of bytes written to WAL + BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache + BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache + WAL_FILE_SYNCED, // Number of times WAL sync is done + WAL_FILE_BYTES, // Number of bytes written to WAL // Writes can be processed by requesting thread or by the thread at the // head of the writers queue. WRITE_DONE_BY_SELF, WRITE_DONE_BY_OTHER, + WRITE_WITH_WAL, // Number of Write calls that request WAL + COMPACT_READ_BYTES, // Bytes read during compaction + COMPACT_WRITE_BYTES, // Bytes written during compaction - WRITE_WITH_WAL, // Number of Write calls that request WAL - - COMPACT_READ_BYTES, // Bytes read during compaction - COMPACT_WRITE_BYTES, // Bytes written during compaction - + // Number of table's properties loaded directly from file, without creating + // table reader object. 
+ NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, TICKER_ENUM_MAX }; // The order of items listed in Tickers should be the same as // the order listed in TickersNameMap const std::vector> TickersNameMap = { - { BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" }, - { BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" }, - { BLOCK_CACHE_ADD, "rocksdb.block.cache.add" }, - { BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss" }, - { BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit" }, - { BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss" }, - { BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit" }, - { BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss" }, - { BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit" }, - { BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful" }, - { MEMTABLE_HIT, "rocksdb.memtable.hit" }, - { MEMTABLE_MISS, "rocksdb.memtable.miss" }, - { COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new" }, - { COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete" }, - { COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user" }, - { NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written" }, - { NUMBER_KEYS_READ, "rocksdb.number.keys.read" }, - { NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated" }, - { BYTES_WRITTEN, "rocksdb.bytes.written" }, - { BYTES_READ, "rocksdb.bytes.read" }, - { NO_FILE_CLOSES, "rocksdb.no.file.closes" }, - { NO_FILE_OPENS, "rocksdb.no.file.opens" }, - { NO_FILE_ERRORS, "rocksdb.no.file.errors" }, - { STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros" }, - { STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros" }, - { STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros" }, - { RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis" }, - { NO_ITERATORS, "rocksdb.num.iterators" }, - { NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get" }, - { NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read" }, - { NUMBER_MULTIGET_BYTES_READ, 
"rocksdb.number.multiget.bytes.read" }, - { NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered" }, - { NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures" }, - { SEQUENCE_NUMBER, "rocksdb.sequence.number" }, - { BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked" }, - { BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" }, - { NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration" }, - { GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls" }, - { BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss" }, - { BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit" }, - { WAL_FILE_SYNCED, "rocksdb.wal.synced" }, - { WAL_FILE_BYTES, "rocksdb.wal.bytes" }, - { WRITE_DONE_BY_SELF, "rocksdb.write.self" }, - { WRITE_DONE_BY_OTHER, "rocksdb.write.other" }, - { WRITE_WITH_WAL, "rocksdb.write.wal" }, - { COMPACT_READ_BYTES, "rocksdb.compact.read.bytes" }, - { COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes" }, -}; + {BLOCK_CACHE_MISS, "rocksdb.block.cache.miss"}, + {BLOCK_CACHE_HIT, "rocksdb.block.cache.hit"}, + {BLOCK_CACHE_ADD, "rocksdb.block.cache.add"}, + {BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss"}, + {BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit"}, + {BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss"}, + {BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit"}, + {BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss"}, + {BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit"}, + {BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"}, + {MEMTABLE_HIT, "rocksdb.memtable.hit"}, + {MEMTABLE_MISS, "rocksdb.memtable.miss"}, + {COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new"}, + {COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete"}, + {COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user"}, + {NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written"}, + {NUMBER_KEYS_READ, "rocksdb.number.keys.read"}, + {NUMBER_KEYS_UPDATED, 
"rocksdb.number.keys.updated"}, + {BYTES_WRITTEN, "rocksdb.bytes.written"}, + {BYTES_READ, "rocksdb.bytes.read"}, + {NO_FILE_CLOSES, "rocksdb.no.file.closes"}, + {NO_FILE_OPENS, "rocksdb.no.file.opens"}, + {NO_FILE_ERRORS, "rocksdb.no.file.errors"}, + {STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros"}, + {STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros"}, + {STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros"}, + {RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis"}, + {NO_ITERATORS, "rocksdb.num.iterators"}, + {NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"}, + {NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read"}, + {NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read"}, + {NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered"}, + {NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures"}, + {SEQUENCE_NUMBER, "rocksdb.sequence.number"}, + {BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked"}, + {BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful"}, + {NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration"}, + {GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls"}, + {BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss"}, + {BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit"}, + {WAL_FILE_SYNCED, "rocksdb.wal.synced"}, + {WAL_FILE_BYTES, "rocksdb.wal.bytes"}, + {WRITE_DONE_BY_SELF, "rocksdb.write.self"}, + {WRITE_DONE_BY_OTHER, "rocksdb.write.other"}, + {WRITE_WITH_WAL, "rocksdb.write.wal"}, + {COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"}, + {COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"}, + {NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, + "rocksdb.number.direct.load.table.properties"}, }; /** * Keep adding histogram's here. 
diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index d4965ca45..5c04257ff 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -60,20 +60,44 @@ struct BlockBasedTableOptions { extern TableFactory* NewBlockBasedTableFactory( const BlockBasedTableOptions& table_options = BlockBasedTableOptions()); -// -- Plain Table +// -- Plain Table with prefix-only seek +// For this factory, you need to set Options.prefix_extractor properly to make +// it work. Look-up starts with a prefix hash lookup for the key prefix. Inside +// the hash bucket found, a binary search is executed for hash conflicts. +// Finally, a linear search is used. // @user_key_len: plain table has optimization for fix-sized keys, which can be // specified via user_key_len. Alternatively, you can pass // `kPlainTableVariableLength` if your keys have variable // lengths. -// @bloom_bits_per_key: the number of bits used for bloom filer per key. You may -// disable it by passing a zero. +// @bloom_bits_per_prefix: the number of bits used for bloom filter per prefix. +// You may disable it by passing a zero. // @hash_table_ratio: the desired utilization of the hash table used for prefix // hashing. hash_table_ratio = number of prefixes / #buckets // in the hash table +// @index_sparseness: inside each prefix, one index record is built for every +// `index_sparseness` keys, for binary search inside each hash bucket. const uint32_t kPlainTableVariableLength = 0; -extern TableFactory* NewPlainTableFactory( +extern TableFactory* NewPlainTableFactory(uint32_t user_key_len = + kPlainTableVariableLength, + int bloom_bits_per_prefix = 10, + double hash_table_ratio = 0.75, + size_t index_sparseness = 16); + +// -- Plain Table +// This factory of plain table ignores Options.prefix_extractor and assumes no +// hashable prefix available to the key structure. Lookup will be based on +// binary search index only. Total order seek() can be issued.
+// @user_key_len: plain table has optimization for fix-sized keys, which can be +// specified via user_key_len. Alternatively, you can pass +// `kPlainTableVariableLength` if your keys have variable +// lengths. +// @bloom_bits_per_key: the number of bits used for bloom filter per key. You +// may disable it by passing a zero. +// @index_sparseness: one index record is built for every `index_sparseness` +// keys, for binary search. +extern TableFactory* NewTotalOrderPlainTableFactory( uint32_t user_key_len = kPlainTableVariableLength, - int bloom_bits_per_key = 10, double hash_table_ratio = 0.75); + int bloom_bits_per_key = 0, size_t index_sparseness = 16); // A base class for table factories. class TableFactory { diff --git a/include/utilities/stackable_db.h b/include/utilities/stackable_db.h index e7c9583b2..d859a6a6f 100644 --- a/include/utilities/stackable_db.h +++ b/include/utilities/stackable_db.h @@ -182,6 +182,10 @@ class StackableDB : public DB { return db_->GetDbIdentity(identity); } + virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) { + return db_->GetPropertiesOfAllTables(props); + } + virtual Status GetUpdatesSince(SequenceNumber seq_number, unique_ptr* iter) override { diff --git a/linters/__phutil_library_map__.php b/linters/__phutil_library_map__.php index cb10bed69..7808dc1a4 100644 --- a/linters/__phutil_library_map__.php +++ b/linters/__phutil_library_map__.php @@ -13,6 +13,7 @@ phutil_register_library_map(array( 'FacebookFbcodeLintEngine' => 'lint_engine/FacebookFbcodeLintEngine.php', 'FbcodeCppLinter' => 'cpp_linter/FbcodeCppLinter.php', 'PfffCppLinter' => 'cpp_linter/PfffCppLinter.php', + 'ArcanistCpplintLinter' => 'cpp_linter/ArcanistCpplintLinter.php', ), 'function' => array( diff --git a/linters/cpp_linter/ArcanistCpplintLinter.php b/linters/cpp_linter/ArcanistCpplintLinter.php new file mode 100644 index 000000000..cb7842248 --- /dev/null +++ b/linters/cpp_linter/ArcanistCpplintLinter.php @@ -0,0 +1,113 @@
+getEngine()->getConfigurationManager(); + $options = $config->getConfigFromAnySource('lint.cpplint.options', ''); + + return $options; + } + + public function getLintPath() { + $config = $this->getEngine()->getConfigurationManager(); + $prefix = $config->getConfigFromAnySource('lint.cpplint.prefix'); + $bin = $config->getConfigFromAnySource('lint.cpplint.bin', 'cpplint.py'); + + if ($prefix !== null) { + if (!Filesystem::pathExists($prefix.'/'.$bin)) { + throw new ArcanistUsageException( + "Unable to find cpplint.py binary in a specified directory. Make ". + "sure that 'lint.cpplint.prefix' and 'lint.cpplint.bin' keys are ". + "set correctly. If you'd rather use a copy of cpplint installed ". + "globally, you can just remove these keys from your .arcconfig."); + } + + $bin = csprintf("%s/%s", $prefix, $bin); + + return $bin; + } + + // Search under current dir + list($err) = exec_manual('which %s/%s', $this->linterDir(), $bin); + if (!$err) { + return $this->linterDir().'/'.$bin; + } + + // Look for globally installed cpplint.py + list($err) = exec_manual('which %s', $bin); + if ($err) { + throw new ArcanistUsageException( + "cpplint.py does not appear to be installed on this system. Install ". + "it (e.g., with 'wget \"http://google-styleguide.googlecode.com/". + "svn/trunk/cpplint/cpplint.py\"') or configure 'lint.cpplint.prefix' ". + "in your .arcconfig to point to the directory where it resides. ". 
+ "Also don't forget to chmod a+x cpplint.py!"); + } + + return $bin; + } + + public function lintPath($path) { + $bin = $this->getLintPath(); + $options = $this->getLintOptions(); + $path = $this->rocksdbDir().'/'.$path; + + $f = new ExecFuture("%C %C $path", $bin, $options); + + list($err, $stdout, $stderr) = $f->resolve(); + + if ($err === 2) { + throw new Exception("cpplint failed to run correctly:\n".$stderr); + } + + $lines = explode("\n", $stderr); + $messages = array(); + foreach ($lines as $line) { + $line = trim($line); + $matches = null; + $regex = '/^[^:]+:(\d+):\s*(.*)\s*\[(.*)\] \[(\d+)\]$/'; + if (!preg_match($regex, $line, $matches)) { + continue; + } + foreach ($matches as $key => $match) { + $matches[$key] = trim($match); + } + $message = new ArcanistLintMessage(); + $message->setPath($path); + $message->setLine($matches[1]); + $message->setCode($matches[3]); + $message->setName($matches[3]); + $message->setDescription($matches[2]); + $message->setSeverity(ArcanistLintSeverity::SEVERITY_WARNING); + $this->addLintMessage($message); + } + } + + // The path of this linter + private function linterDir() { + return dirname(__FILE__); + } + + // TODO(kaili) a quick and dirty way to figure out rocksdb's root dir. 
+ private function rocksdbDir() { + return $this->linterDir()."/../.."; + } +} diff --git a/linters/cpplint.py b/linters/cpp_linter/cpplint.py similarity index 100% rename from linters/cpplint.py rename to linters/cpp_linter/cpplint.py diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index fa84b5a38..4465899fb 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -270,12 +270,20 @@ Status ReadTableMagicNumber(const std::string& file_path, uint64_t file_size; options.env->GetFileSize(file_path, &file_size); + return ReadTableMagicNumber(file.get(), file_size, options, env_options, + table_magic_number); +} + +Status ReadTableMagicNumber(RandomAccessFile* file, uint64_t file_size, + const Options& options, + const EnvOptions& env_options, + uint64_t* table_magic_number) { if (file_size < Footer::kEncodedLength) { return Status::InvalidArgument("file is too short to be an sstable"); } Footer footer; - s = ReadFooterFromFile(file.get(), file_size, &footer); + auto s = ReadFooterFromFile(file, file_size, &footer); if (!s.ok()) { return s; } diff --git a/table/meta_blocks.h b/table/meta_blocks.h index f74e66592..a355905a1 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -124,4 +124,9 @@ Status ReadTableMagicNumber(const std::string& file_path, const Options& options, const EnvOptions& env_options, uint64_t* table_magic_number); + +Status ReadTableMagicNumber(RandomAccessFile* file, uint64_t file_size, + const Options& options, + const EnvOptions& env_options, + uint64_t* table_magic_number); } // namespace rocksdb diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc index c7ee8eb2f..16ee24eb4 100644 --- a/table/plain_table_factory.cc +++ b/table/plain_table_factory.cc @@ -21,7 +21,7 @@ Status PlainTableFactory::NewTableReader(const Options& options, unique_ptr* table) const { return PlainTableReader::Open(options, soptions, icomp, std::move(file), file_size, table, bloom_bits_per_key_, - hash_table_ratio_); + 
hash_table_ratio_, index_sparseness_); } TableBuilder* PlainTableFactory::NewTableBuilder( @@ -32,9 +32,17 @@ TableBuilder* PlainTableFactory::NewTableBuilder( extern TableFactory* NewPlainTableFactory(uint32_t user_key_len, int bloom_bits_per_key, - double hash_table_ratio) { + double hash_table_ratio, + size_t index_sparseness) { return new PlainTableFactory(user_key_len, bloom_bits_per_key, - hash_table_ratio); + hash_table_ratio, index_sparseness); +} + +extern TableFactory* NewTotalOrderPlainTableFactory(uint32_t user_key_len, + int bloom_bits_per_key, + size_t index_sparseness) { + return new PlainTableFactory(user_key_len, bloom_bits_per_key, 0, + index_sparseness); } } // namespace rocksdb diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index 382efe3c1..a0a7fbe6f 100644 --- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -48,12 +48,20 @@ class PlainTableFactory : public TableFactory { // number of bits used for bloom filer per key. hash_table_ratio is // the desired utilization of the hash table used for prefix hashing. // hash_table_ratio = number of prefixes / #buckets in the hash table + // hash_table_ratio = 0 means skip the hash table and rely only on binary + // search. + // index_sparseness determines index interval for keys + // inside the same prefix. It will be the maximum number of linear search + // required after hash and binary search. + // index_sparseness = 0 means index for every key.
explicit PlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength, int bloom_bits_per_key = 0, - double hash_table_ratio = 0.75) + double hash_table_ratio = 0.75, + size_t index_sparseness = 16) : user_key_len_(user_key_len), bloom_bits_per_key_(bloom_bits_per_key), - hash_table_ratio_(hash_table_ratio) {} + hash_table_ratio_(hash_table_ratio), + index_sparseness_(index_sparseness) {} const char* Name() const override { return "PlainTable"; } Status NewTableReader(const Options& options, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, @@ -71,6 +79,7 @@ class PlainTableFactory : public TableFactory { uint32_t user_key_len_; int bloom_bits_per_key_; double hash_table_ratio_; + size_t index_sparseness_; }; } // namespace rocksdb diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index cf1025097..593530c87 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -5,6 +5,7 @@ #include "table/plain_table_reader.h" #include +#include #include "db/dbformat.h" @@ -35,7 +36,7 @@ namespace rocksdb { namespace { -inline uint32_t GetSliceHash(Slice const& s) { +inline uint32_t GetSliceHash(const Slice& s) { return Hash(s.data(), s.size(), 397) ; } @@ -43,12 +44,18 @@ inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) { return hash % num_buckets; } +// Safely gets a uint32_t element from a char array, where, starting from +// `base`, every 4 bytes are considered as a fixed 32-bit integer.
+inline uint32_t GetFixed32Element(const char* base, size_t offset) { + return DecodeFixed32(base + offset * sizeof(uint32_t)); +} + } // namespace // Iterator to iterate IndexedTable class PlainTableIterator : public Iterator { public: - explicit PlainTableIterator(PlainTableReader* table); + explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek); ~PlainTableIterator(); bool Valid() const; @@ -71,6 +78,7 @@ class PlainTableIterator : public Iterator { private: PlainTableReader* table_; + bool use_prefix_seek_; uint32_t offset_; uint32_t next_offset_; Slice key_; @@ -83,34 +91,34 @@ class PlainTableIterator : public Iterator { }; extern const uint64_t kPlainTableMagicNumber; -PlainTableReader::PlainTableReader(const EnvOptions& storage_options, - const InternalKeyComparator& icomparator, - uint64_t file_size, int bloom_bits_per_key, - double hash_table_ratio, - const TableProperties* table_properties) - : soptions_(storage_options), +PlainTableReader::PlainTableReader( + const Options& options, unique_ptr&& file, + const EnvOptions& storage_options, const InternalKeyComparator& icomparator, + uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio, + size_t index_sparseness, const TableProperties* table_properties) + : options_(options), + soptions_(storage_options), + file_(std::move(file)), internal_comparator_(icomparator), file_size_(file_size), kHashTableRatio(hash_table_ratio), kBloomBitsPerKey(bloom_bits_per_key), + kIndexIntervalForSamePrefixKeys(index_sparseness), table_properties_(table_properties), data_end_offset_(table_properties_->data_size), - user_key_len_(table_properties->fixed_key_len) {} + user_key_len_(table_properties->fixed_key_len) { + assert(kHashTableRatio >= 0.0); +} PlainTableReader::~PlainTableReader() { - delete[] hash_table_; - delete[] sub_index_; - delete bloom_; -} - -Status PlainTableReader::Open(const Options& options, - const EnvOptions& soptions, - const InternalKeyComparator& 
internal_comparator, - unique_ptr&& file, - uint64_t file_size, - unique_ptr* table_reader, - const int bloom_bits_per_key, - double hash_table_ratio) { +} + +Status PlainTableReader::Open( + const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader, const int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness) { assert(options.allow_mmap_reads); if (file_size > kMaxFileSize) { @@ -124,11 +132,9 @@ Status PlainTableReader::Open(const Options& options, return s; } - std::unique_ptr new_reader( - new PlainTableReader(soptions, internal_comparator, file_size, - bloom_bits_per_key, hash_table_ratio, props)); - new_reader->file_ = std::move(file); - new_reader->options_ = options; + std::unique_ptr new_reader(new PlainTableReader( + options, std::move(file), soptions, internal_comparator, file_size, + bloom_bits_per_key, hash_table_ratio, index_sparseness, props)); // -- Populate Index s = new_reader->PopulateIndex(); @@ -148,7 +154,7 @@ bool PlainTableReader::PrefixMayMatch(const Slice& internal_prefix) { } Iterator* PlainTableReader::NewIterator(const ReadOptions& options) { - return new PlainTableIterator(this); + return new PlainTableIterator(this, options.prefix_seek); } struct PlainTableReader::IndexRecord { @@ -197,6 +203,9 @@ class PlainTableReader::IndexRecordList { return result; } + // Each group in `groups_` contains fix-sized records (determined by + // kNumRecordsPerGroup). Which can help us minimize the cost if resizing + // occurs. 
const size_t kNumRecordsPerGroup; IndexRecord* current_group_; // List of arrays allocated @@ -204,79 +213,96 @@ class PlainTableReader::IndexRecordList { size_t num_records_in_current_group_; }; -int PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list) { +Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list, + int* num_prefixes) const { Slice prev_key_prefix_slice; uint32_t prev_key_prefix_hash = 0; uint32_t pos = data_start_offset_; - int key_index_within_prefix = 0; + int num_keys_per_prefix = 0; bool is_first_record = true; HistogramImpl keys_per_prefix_hist; // Need map to be ordered to make sure sub indexes generated // are in order. - int num_prefixes = 0; + *num_prefixes = 0; while (pos < data_end_offset_) { uint32_t key_offset = pos; ParsedInternalKey key; Slice value_slice; - status_ = Next(pos, &key, &value_slice, pos); + Status s = Next(&pos, &key, &value_slice); + if (!s.ok()) { + return s; + } + if (bloom_) { + // total order mode and bloom filter is enabled. 
+ bloom_->AddHash(GetSliceHash(key.user_key)); + } Slice key_prefix_slice = GetPrefix(key); if (is_first_record || prev_key_prefix_slice != key_prefix_slice) { - ++num_prefixes; + ++(*num_prefixes); if (!is_first_record) { - keys_per_prefix_hist.Add(key_index_within_prefix); + keys_per_prefix_hist.Add(num_keys_per_prefix); } - key_index_within_prefix = 0; + num_keys_per_prefix = 0; prev_key_prefix_slice = key_prefix_slice; prev_key_prefix_hash = GetSliceHash(key_prefix_slice); } - if (key_index_within_prefix++ % kIndexIntervalForSamePrefixKeys == 0) { + if (kIndexIntervalForSamePrefixKeys == 0 || + num_keys_per_prefix++ % kIndexIntervalForSamePrefixKeys == 0) { // Add an index key for every kIndexIntervalForSamePrefixKeys keys record_list->AddRecord(prev_key_prefix_hash, key_offset); } is_first_record = false; } - keys_per_prefix_hist.Add(key_index_within_prefix); + keys_per_prefix_hist.Add(num_keys_per_prefix); Log(options_.info_log, "Number of Keys per prefix Histogram: %s", keys_per_prefix_hist.ToString().c_str()); - return num_prefixes; + return Status::OK(); } void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) { - delete[] hash_table_; + index_.reset(); + + if (options_.prefix_extractor != nullptr) { + uint32_t bloom_total_bits = num_prefixes * kBloomBitsPerKey; + if (bloom_total_bits > 0) { + bloom_.reset(new DynamicBloom(bloom_total_bits)); + } + } - if (kBloomBitsPerKey > 0) { - bloom_ = new DynamicBloom(num_prefixes * kBloomBitsPerKey); + if (options_.prefix_extractor == nullptr || kHashTableRatio <= 0) { + // Fall back to pure binary search if the user fails to specify a prefix + // extractor. + index_size_ = 1; + } else { + double hash_table_size_multipier = 1.0 / kHashTableRatio; + index_size_ = num_prefixes * hash_table_size_multipier + 1; } - double hash_table_size_multipier = - (kHashTableRatio > 1.0) ? 
1.0 : 1.0 / kHashTableRatio; - hash_table_size_ = num_prefixes * hash_table_size_multipier + 1; - hash_table_ = new uint32_t[hash_table_size_]; + index_.reset(new uint32_t[index_size_]); } size_t PlainTableReader::BucketizeIndexesAndFillBloom( - IndexRecordList& record_list, int num_prefixes, - std::vector* hash_to_offsets, + IndexRecordList* record_list, std::vector* hash_to_offsets, std::vector* bucket_count) { size_t sub_index_size_needed = 0; bool first = true; uint32_t prev_hash = 0; - size_t num_records = record_list.GetNumRecords(); + size_t num_records = record_list->GetNumRecords(); for (size_t i = 0; i < num_records; i++) { - IndexRecord* index_record = record_list.At(i); + IndexRecord* index_record = record_list->At(i); uint32_t cur_hash = index_record->hash; if (first || prev_hash != cur_hash) { prev_hash = cur_hash; first = false; - if (bloom_) { + if (bloom_ && !IsTotalOrderMode()) { bloom_->AddHash(cur_hash); } } - uint32_t bucket = GetBucketIdFromHash(cur_hash, hash_table_size_); + uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_); IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket]; index_record->next = prev_bucket_head; (*hash_to_offsets)[bucket] = index_record; @@ -306,27 +332,24 @@ void PlainTableReader::FillIndexes( size_t buffer_size = 8 * 8; size_t buffer_used = 0; sub_index_size_needed += buffer_size; - sub_index_ = new char[sub_index_size_needed]; + sub_index_.reset(new char[sub_index_size_needed]); size_t sub_index_offset = 0; - char* prev_ptr; - char* cur_ptr; - uint32_t* sub_index_ptr; - for (int i = 0; i < hash_table_size_; i++) { + for (int i = 0; i < index_size_; i++) { uint32_t num_keys_for_bucket = bucket_count[i]; switch (num_keys_for_bucket) { case 0: // No key for bucket - hash_table_[i] = data_end_offset_; + index_[i] = data_end_offset_; break; case 1: // point directly to the file offset - hash_table_[i] = hash_to_offsets[i]->offset; + index_[i] = hash_to_offsets[i]->offset; break; default: // point to second 
level indexes. - hash_table_[i] = sub_index_offset | kSubIndexMask; - prev_ptr = sub_index_ + sub_index_offset; - cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket); + index_[i] = sub_index_offset | kSubIndexMask; + char* prev_ptr = &sub_index_[sub_index_offset]; + char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket); sub_index_offset += (cur_ptr - prev_ptr); if (cur_ptr - prev_ptr > 2 || (cur_ptr - prev_ptr == 2 && num_keys_for_bucket <= 127)) { @@ -339,17 +362,16 @@ void PlainTableReader::FillIndexes( sub_index_size_needed += buffer_size; buffer_size *= 2; char* new_sub_index = new char[sub_index_size_needed]; - memcpy(new_sub_index, sub_index_, sub_index_offset); - delete[] sub_index_; - sub_index_ = new_sub_index; + memcpy(new_sub_index, sub_index_.get(), sub_index_offset); + sub_index_.reset(new_sub_index); } } - sub_index_ptr = (uint32_t*) (sub_index_ + sub_index_offset); + char* sub_index_pos = &sub_index_[sub_index_offset]; IndexRecord* record = hash_to_offsets[i]; int j; for (j = num_keys_for_bucket - 1; j >= 0 && record; j--, record = record->next) { - sub_index_ptr[j] = record->offset; + EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset); } assert(j == -1 && record == nullptr); sub_index_offset += kOffsetLen * num_keys_for_bucket; @@ -358,10 +380,16 @@ void PlainTableReader::FillIndexes( } Log(options_.info_log, "hash table size: %d, suffix_map length %zu", - hash_table_size_, sub_index_size_needed); + index_size_, sub_index_size_needed); } Status PlainTableReader::PopulateIndex() { + // options.prefix_extractor is required for a hash-based look-up. + if (options_.prefix_extractor == nullptr && kHashTableRatio != 0) { + return Status::NotSupported( + "PlainTable requires a prefix extractor enable prefix hash mode."); + } + // Get mmapped memory to file_data_.
Status s = file_->Read(0, file_size_, &file_data_, nullptr); if (!s.ok()) { @@ -373,17 +401,30 @@ Status PlainTableReader::PopulateIndex() { // for a prefix (starting from the first one), generate a record of (hash, // offset) and append it to IndexRecordList, which is a data structure created // to store them. - int num_prefixes = PopulateIndexRecordList(&record_list); + int num_prefixes; + + // Allocate bloom filter here for total order mode. + if (IsTotalOrderMode()) { + uint32_t num_bloom_bits = table_properties_->num_entries * kBloomBitsPerKey; + if (num_bloom_bits > 0) { + bloom_.reset(new DynamicBloom(num_bloom_bits)); + } + } + + s = PopulateIndexRecordList(&record_list, &num_prefixes); + if (!s.ok()) { + return s; + } // Calculated hash table and bloom filter size and allocate memory for indexes // and bloom filter based on the number of prefixes. AllocateIndexAndBloom(num_prefixes); // Bucketize all the index records to a temp data structure, in which for // each bucket, we generate a linked list of IndexRecord, in reversed order. - std::vector hash_to_offsets(hash_table_size_, nullptr); - std::vector bucket_count(hash_table_size_, 0); + std::vector hash_to_offsets(index_size_, nullptr); + std::vector bucket_count(index_size_, 0); size_t sub_index_size_needed = BucketizeIndexesAndFillBloom( - record_list, num_prefixes, &hash_to_offsets, &bucket_count); + &record_list, &hash_to_offsets, &bucket_count); // From the temp data structure, populate indexes. 
FillIndexes(sub_index_size_needed, hash_to_offsets, bucket_count); @@ -392,16 +433,16 @@ Status PlainTableReader::PopulateIndex() { Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, uint32_t prefix_hash, bool& prefix_matched, - uint32_t& ret_offset) { + uint32_t* offset) const { prefix_matched = false; - int bucket = GetBucketIdFromHash(prefix_hash, hash_table_size_); - uint32_t bucket_value = hash_table_[bucket]; + int bucket = GetBucketIdFromHash(prefix_hash, index_size_); + uint32_t bucket_value = index_[bucket]; if (bucket_value == data_end_offset_) { - ret_offset = data_end_offset_; + *offset = data_end_offset_; return Status::OK(); } else if ((bucket_value & kSubIndexMask) == 0) { // point directly to the file - ret_offset = bucket_value; + *offset = bucket_value; return Status::OK(); } @@ -409,11 +450,9 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, uint32_t low = 0; uint64_t prefix_index_offset = bucket_value ^ kSubIndexMask; - const char* index_ptr = sub_index_ + prefix_index_offset; + const char* index_ptr = &sub_index_[prefix_index_offset]; uint32_t upper_bound = 0; - const uint32_t* base_ptr = (const uint32_t*) GetVarint32Ptr(index_ptr, - index_ptr + 4, - &upper_bound); + const char* base_ptr = GetVarint32Ptr(index_ptr, index_ptr + 4, &upper_bound); uint32_t high = upper_bound; ParsedInternalKey mid_key; ParsedInternalKey parsed_target; @@ -424,9 +463,9 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, // The key is between [low, high). Do a binary search between it. 
while (high - low > 1) { uint32_t mid = (high + low) / 2; - uint32_t file_offset = base_ptr[mid]; + uint32_t file_offset = GetFixed32Element(base_ptr, mid); size_t tmp; - Status s = ReadKey(file_data_.data() + file_offset, &mid_key, tmp); + Status s = ReadKey(file_data_.data() + file_offset, &mid_key, &tmp); if (!s.ok()) { return s; } @@ -438,7 +477,7 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, // Happen to have found the exact key or target is smaller than the // first key after base_offset. prefix_matched = true; - ret_offset = file_offset; + *offset = file_offset; return Status::OK(); } else { high = mid; @@ -450,48 +489,48 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, // to the wrong prefix. ParsedInternalKey low_key; size_t tmp; - uint32_t low_key_offset = base_ptr[low]; - Status s = ReadKey(file_data_.data() + low_key_offset, &low_key, tmp); + uint32_t low_key_offset = GetFixed32Element(base_ptr, low); + Status s = ReadKey(file_data_.data() + low_key_offset, &low_key, &tmp); if (GetPrefix(low_key) == prefix) { prefix_matched = true; - ret_offset = low_key_offset; + *offset = low_key_offset; } else if (low + 1 < upper_bound) { // There is possible a next prefix, return it prefix_matched = false; - ret_offset = base_ptr[low + 1]; + *offset = GetFixed32Element(base_ptr, low + 1); } else { // target is larger than a key of the last prefix in this bucket // but with a different prefix. Key does not exist. 
- ret_offset = data_end_offset_; + *offset = data_end_offset_; } return Status::OK(); } -bool PlainTableReader::MayHavePrefix(uint32_t hash) { +bool PlainTableReader::MatchBloom(uint32_t hash) const { return bloom_ == nullptr || bloom_->MayContainHash(hash); } -Slice PlainTableReader::GetPrefix(const ParsedInternalKey& target) { - return options_.prefix_extractor->Transform(target.user_key); +Slice PlainTableReader::GetPrefix(const ParsedInternalKey& target) const { + return GetPrefixFromUserKey(target.user_key); } -Status PlainTableReader::ReadKey(const char* row_ptr, ParsedInternalKey* key, - size_t& bytes_read) { +Status PlainTableReader::ReadKey(const char* start, ParsedInternalKey* key, + size_t* bytes_read) const { const char* key_ptr = nullptr; - bytes_read = 0; + *bytes_read = 0; size_t user_key_size = 0; if (IsFixedLength()) { user_key_size = user_key_len_; - key_ptr = row_ptr; + key_ptr = start; } else { uint32_t tmp_size = 0; - key_ptr = GetVarint32Ptr(row_ptr, file_data_.data() + data_end_offset_, - &tmp_size); + key_ptr = + GetVarint32Ptr(start, file_data_.data() + data_end_offset_, &tmp_size); if (key_ptr == nullptr) { return Status::Corruption("Unable to read the next key"); } user_key_size = (size_t)tmp_size; - bytes_read = key_ptr - row_ptr; + *bytes_read = key_ptr - start; } if (key_ptr + user_key_size + 1 >= file_data_.data() + data_end_offset_) { return Status::Corruption("Unable to read the next key"); @@ -502,43 +541,42 @@ Status PlainTableReader::ReadKey(const char* row_ptr, ParsedInternalKey* key, key->user_key = Slice(key_ptr, user_key_size); key->sequence = 0; key->type = kTypeValue; - bytes_read += user_key_size + 1; + *bytes_read += user_key_size + 1; } else { - if (row_ptr + user_key_size + 8 >= file_data_.data() + data_end_offset_) { + if (start + user_key_size + 8 >= file_data_.data() + data_end_offset_) { return Status::Corruption("Unable to read the next key"); } if (!ParseInternalKey(Slice(key_ptr, user_key_size + 8), key)) { 
return Status::Corruption(Slice()); } - bytes_read += user_key_size + 8; + *bytes_read += user_key_size + 8; } return Status::OK(); } -Status PlainTableReader::Next(uint32_t offset, ParsedInternalKey* key, - Slice* value, uint32_t& next_offset) { - if (offset == data_end_offset_) { - next_offset = data_end_offset_; +Status PlainTableReader::Next(uint32_t* offset, ParsedInternalKey* key, + Slice* value) const { + if (*offset == data_end_offset_) { + *offset = data_end_offset_; return Status::OK(); } - if (offset > data_end_offset_) { + if (*offset > data_end_offset_) { return Status::Corruption("Offset is out of file size"); } - const char* row_ptr = file_data_.data() + offset; + const char* start = file_data_.data() + *offset; size_t bytes_for_key; - Status s = ReadKey(row_ptr, key, bytes_for_key); + Status s = ReadKey(start, key, &bytes_for_key); uint32_t value_size; - const char* value_ptr = GetVarint32Ptr(row_ptr + bytes_for_key, - file_data_.data() + data_end_offset_, - &value_size); + const char* value_ptr = GetVarint32Ptr( + start + bytes_for_key, file_data_.data() + data_end_offset_, &value_size); if (value_ptr == nullptr) { return Status::Corruption("Error reading value length."); } - next_offset = offset + (value_ptr - row_ptr) + value_size; - if (next_offset > data_end_offset_) { + *offset = *offset + (value_ptr - start) + value_size; + if (*offset > data_end_offset_) { return Status::Corruption("Reach end of file when reading value"); } *value = Slice(value_ptr, value_size); @@ -552,14 +590,28 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, const Slice&, bool), void (*mark_key_may_exist)(void*)) { // Check bloom filter first. - Slice prefix_slice = GetPrefix(target); - uint32_t prefix_hash = GetSliceHash(prefix_slice); - if (!MayHavePrefix(prefix_hash)) { - return Status::OK(); + Slice prefix_slice; + uint32_t prefix_hash; + if (IsTotalOrderMode()) { + // Match whole user key for bloom filter check. 
+ if (!MatchBloom(GetSliceHash(GetUserKey(target)))) { + return Status::OK(); + } + // in total order mode, there is only one bucket 0, and we always use empty + // prefix. + prefix_slice = Slice(); + prefix_hash = 0; + } else { + prefix_slice = GetPrefix(target); + prefix_hash = GetSliceHash(prefix_slice); + if (!MatchBloom(prefix_hash)) { + return Status::OK(); + } } uint32_t offset; bool prefix_match; - Status s = GetOffset(target, prefix_slice, prefix_hash, prefix_match, offset); + Status s = + GetOffset(target, prefix_slice, prefix_hash, prefix_match, &offset); if (!s.ok()) { return s; } @@ -571,7 +623,7 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, Slice found_value; while (offset < data_end_offset_) { - Status s = Next(offset, &found_key, &found_value, offset); + Status s = Next(&offset, &found_key, &found_value); if (!s.ok()) { return s; } @@ -596,8 +648,9 @@ uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& key) { return 0; } -PlainTableIterator::PlainTableIterator(PlainTableReader* table) : - table_(table) { +PlainTableIterator::PlainTableIterator(PlainTableReader* table, + bool use_prefix_seek) + : table_(table), use_prefix_seek_(use_prefix_seek) { next_offset_ = offset_ = table_->data_end_offset_; } @@ -620,18 +673,39 @@ void PlainTableIterator::SeekToFirst() { void PlainTableIterator::SeekToLast() { assert(false); + status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable"); } void PlainTableIterator::Seek(const Slice& target) { - Slice prefix_slice = table_->GetPrefix(target); - uint32_t prefix_hash = GetSliceHash(prefix_slice); - if (!table_->MayHavePrefix(prefix_hash)) { + // If the user doesn't set prefix seek option and we are not able to do a + // total Seek(). assert failure. 
+ if (!use_prefix_seek_ && table_->index_size_ > 1) { + assert(false); + status_ = Status::NotSupported( + "PlainTable cannot issue non-prefix seek unless in total order mode."); + offset_ = next_offset_ = table_->data_end_offset_; + return; + } + + Slice prefix_slice = table_->GetPrefix(target); + uint32_t prefix_hash; + uint32_t bloom_hash; + if (table_->IsTotalOrderMode()) { + // The total order mode, there is only one hash bucket 0. The bloom filter + // is checked against the whole user key. + prefix_hash = 0; + bloom_hash = GetSliceHash(table_->GetUserKey(target)); + } else { + prefix_hash = GetSliceHash(prefix_slice); + bloom_hash = prefix_hash; + } + if (!table_->MatchBloom(bloom_hash)) { offset_ = next_offset_ = table_->data_end_offset_; return; } bool prefix_match; status_ = table_->GetOffset(target, prefix_slice, prefix_hash, prefix_match, - next_offset_); + &next_offset_); if (!status_.ok()) { offset_ = next_offset_ = table_->data_end_offset_; return; @@ -661,7 +735,7 @@ void PlainTableIterator::Next() { if (offset_ < table_->data_end_offset_) { Slice tmp_slice; ParsedInternalKey parsed_key; - status_ = table_->Next(next_offset_, &parsed_key, &value_, next_offset_); + status_ = table_->Next(&next_offset_, &parsed_key, &value_); if (status_.ok()) { // Make a copy in this case. TODO optimize. 
tmp_str_.clear(); diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index dd7b1e50f..16bbc8ba5 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -49,7 +49,8 @@ class PlainTableReader: public TableReader { const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table, - const int bloom_bits_per_key, double hash_table_ratio); + const int bloom_bits_per_key, double hash_table_ratio, + size_t index_sparseness); bool PrefixMayMatch(const Slice& internal_prefix); @@ -68,36 +69,97 @@ class PlainTableReader: public TableReader { return table_properties_; } - PlainTableReader(const EnvOptions& storage_options, + PlainTableReader(const Options& options, unique_ptr&& file, + const EnvOptions& storage_options, const InternalKeyComparator& internal_comparator, uint64_t file_size, int bloom_num_bits, - double hash_table_ratio, + double hash_table_ratio, size_t index_sparseness, const TableProperties* table_properties); - ~PlainTableReader(); + virtual ~PlainTableReader(); + + protected: + // Check bloom filter to see whether it might contain this prefix. + // The hash of the prefix is given, since it can be reused for index lookup + // too. + virtual bool MatchBloom(uint32_t hash) const; + + // PopulateIndex() builds index of keys. It must be called before any query + // to the table. + // + // index_ contains buckets size of index_size_, each is a + // 32-bit integer. The lower 31 bits contain an offset value (explained below) + // and the first bit of the integer indicates type of the offset. 
+ // + // +--------------+------------------------------------------------------+ + // | Flag (1 bit) | Offset to binary search buffer or file (31 bits) + + // +--------------+------------------------------------------------------+ + // + // Explanation for the "flag bit": + // + // 0 indicates that the bucket contains only one prefix (no conflict when + // hashing this prefix), whose first row starts from this offset of the + // file. + // 1 indicates that the bucket contains more than one prefixes, or there + // are too many rows for one prefix so we need a binary search for it. In + // this case, the offset indicates the offset of sub_index_ holding the + // binary search indexes of keys for those rows. Those binary search indexes + // are organized in this way: + // + // The first 4 bytes, indicate how many indexes (N) are stored after it. After + // it, there are N 32-bit integers, each points of an offset of the file, + // which + // points to starting of a row. Those offsets need to be guaranteed to be in + // ascending order so the keys they are pointing to are also in ascending + // order + // to make sure we can use them to do binary searches. Below is visual + // presentation of a bucket. + // + // + // number_of_records: varint32 + // record 1 file offset: fixedint32 + // record 2 file offset: fixedint32 + // .... + // record N file offset: fixedint32 + // + Status PopulateIndex(); private: struct IndexRecord; class IndexRecordList; - uint32_t* hash_table_ = nullptr; - int hash_table_size_ = 0; - char* sub_index_ = nullptr; + // Plain table maintains an index and a sub index. + // index is implemented by a hash table. + // subindex is a big of memory array. 
+ // For more details about the in-memory index, please refer to: + // https://github.com/facebook/rocksdb/wiki/PlainTable-Format + // #wiki-in-memory-index-format + std::unique_ptr index_; + int index_size_ = 0; + std::unique_ptr sub_index_; Options options_; const EnvOptions& soptions_; + unique_ptr file_; + const InternalKeyComparator internal_comparator_; + // represents plain table's current status. Status status_; - unique_ptr file_; Slice file_data_; - uint32_t version_; uint32_t file_size_; const double kHashTableRatio; const int kBloomBitsPerKey; - DynamicBloom* bloom_ = nullptr; + // To speed up the search for keys with same prefix, we'll add index key for + // every N keys, where the "N" is determined by + // kIndexIntervalForSamePrefixKeys + const size_t kIndexIntervalForSamePrefixKeys = 16; + // Bloom filter is used to rule out non-existent key + unique_ptr bloom_; std::shared_ptr table_properties_; + // data_start_offset_ and data_end_offset_ defines the range of the + // sst file that stores data. const uint32_t data_start_offset_ = 0; const uint32_t data_end_offset_; const size_t user_key_len_; @@ -107,10 +169,6 @@ class PlainTableReader: public TableReader { static const size_t kOffsetLen = sizeof(uint32_t); static const uint64_t kMaxFileSize = 1u << 31; static const size_t kRecordsPerGroup = 256; - // To speed up the search for keys with same prefix, we'll add index key for - // every N keys, where the "N" is determined by - // kIndexIntervalForSamePrefixKeys - static const size_t kIndexIntervalForSamePrefixKeys = 16; bool IsFixedLength() const { return user_key_len_ != kPlainTableVariableLength; @@ -125,95 +183,72 @@ class PlainTableReader: public TableReader { // Internal helper function to generate an IndexRecordList object from all // the rows, which contains index records as a list. - int PopulateIndexRecordList(IndexRecordList* record_list); + // If bloom_ is not null, all the keys' full-key hash will be added to the + // bloom filter. 
+ Status PopulateIndexRecordList(IndexRecordList* record_list, + int* num_prefixes) const; // Internal helper function to allocate memory for indexes and bloom filters void AllocateIndexAndBloom(int num_prefixes); // Internal helper function to bucket index record list to hash buckets. - // hash_to_offsets is sized of of hash_table_size_, each contains a linked - // list + // bucket_header is a vector of size hash_table_size_, with each entry + // containing a linklist of IndexRecord hashed to the same bucket, in reverse + // order. // of offsets for the hash, in reversed order. - // bucket_count is sized of hash_table_size_. The value is how many index - // records are there in hash_to_offsets for the same bucket. - size_t BucketizeIndexesAndFillBloom( - IndexRecordList& record_list, int num_prefixes, - std::vector* hash_to_offsets, - std::vector* bucket_count); + // bucket_count is sized of index_size_. The value is how many index + // records are there in bucket_headers for the same bucket. + size_t BucketizeIndexesAndFillBloom(IndexRecordList* record_list, + std::vector* bucket_headers, + std::vector* bucket_count); // Internal helper class to fill the indexes and bloom filters to internal - // data structures. hash_to_offsets and bucket_count are bucketized indexes + // data structures. bucket_headers and bucket_count are bucketized indexes // and counts generated by BucketizeIndexesAndFillBloom(). void FillIndexes(size_t sub_index_size_needed, - const std::vector& hash_to_offsets, + const std::vector& bucket_headers, const std::vector& bucket_count); - // PopulateIndex() builds index of keys. It must be called before any query - // to the table. - // - // hash_table_ contains buckets size of hash_table_size_, each is a 32-bit - // integer. The lower 31 bits contain an offset value (explained below) and - // the first bit of the integer indicates type of the offset. 
- // - // +--------------+------------------------------------------------------+ - // | Flag (1 bit) | Offset to binary search buffer or file (31 bits) + - // +--------------+------------------------------------------------------+ - // - // Explanation for the "flag bit": - // - // 0 indicates that the bucket contains only one prefix (no conflict when - // hashing this prefix), whose first row starts from this offset of the - // file. - // 1 indicates that the bucket contains more than one prefixes, or there - // are too many rows for one prefix so we need a binary search for it. In - // this case, the offset indicates the offset of sub_index_ holding the - // binary search indexes of keys for those rows. Those binary search indexes - // are organized in this way: - // - // The first 4 bytes, indicate how many indexes (N) are stored after it. After - // it, there are N 32-bit integers, each points of an offset of the file, - // which - // points to starting of a row. Those offsets need to be guaranteed to be in - // ascending order so the keys they are pointing to are also in ascending - // order - // to make sure we can use them to do binary searches. Below is visual - // presentation of a bucket. - // - // - // number_of_records: varint32 - // record 1 file offset: fixedint32 - // record 2 file offset: fixedint32 - // .... - // record N file offset: fixedint32 - // - Status PopulateIndex(); - - // Check bloom filter to see whether it might contain this prefix. - // The hash of the prefix is given, since it can be reused for index lookup - // too. - bool MayHavePrefix(uint32_t hash); - + // Read a plain table key from the position `start`. The read content + // will be written to `key` and the size of read bytes will be populated + // in `bytes_read`. Status ReadKey(const char* row_ptr, ParsedInternalKey* key, - size_t& bytes_read); - // Read the key and value at offset to key and value. - // tmp_slice is a tmp slice. 
- // return next_offset as the offset for the next key. - Status Next(uint32_t offset, ParsedInternalKey* key, Slice* value, - uint32_t& next_offset); + size_t* bytes_read) const; + // Read the key and value at `offset` to parameters `key` and `value`. + // On success, `offset` will be updated as the offset for the next key. + Status Next(uint32_t* offset, ParsedInternalKey* key, Slice* value) const; // Get file offset for key target. // return value prefix_matched is set to true if the offset is confirmed // for a key with the same prefix as target. Status GetOffset(const Slice& target, const Slice& prefix, uint32_t prefix_hash, bool& prefix_matched, - uint32_t& ret_offset); + uint32_t* offset) const; + + Slice GetUserKey(const Slice& key) const { + return Slice(key.data(), key.size() - 8); + } - Slice GetPrefix(const Slice& target) { + Slice GetPrefix(const Slice& target) const { assert(target.size() >= 8); // target is internal key - return options_.prefix_extractor->Transform( - Slice(target.data(), target.size() - 8)); + return GetPrefixFromUserKey(GetUserKey(target)); } - Slice GetPrefix(const ParsedInternalKey& target); + inline Slice GetPrefix(const ParsedInternalKey& target) const; + + Slice GetPrefixFromUserKey(const Slice& user_key) const { + if (!IsTotalOrderMode()) { + return options_.prefix_extractor->Transform(user_key); + } else { + // Use empty slice as prefix if prefix_extractor is not set. In that case, + // it falls back to pure binary search and total iterator seek is + // supported. 
+ return Slice(); + } + } + + bool IsTotalOrderMode() const { + return (options_.prefix_extractor == nullptr); + } // No copying allowed explicit PlainTableReader(const TableReader&) = delete; diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index f746592fe..0d070a14e 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -13,6 +13,7 @@ #include "port/atomic_pointer.h" #include "table/block_based_table_factory.h" #include "table/plain_table_factory.h" +#include "table/table_builder.h" #include "util/histogram.h" #include "util/testharness.h" #include "util/testutil.h" @@ -39,6 +40,10 @@ static bool DummySaveValue(void* arg, const ParsedInternalKey& ikey, return false; } +uint64_t Now(Env* env, bool measured_by_nanosecond) { + return measured_by_nanosecond ? env->NowNanos() : env->NowMicros(); +} + // A very simple benchmark that. // Create a table with roughly numKey1 * numKey2 keys, // where there are numKey1 prefixes of the key, each has numKey2 number of @@ -56,13 +61,14 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, ReadOptions& read_options, int num_keys1, int num_keys2, int num_iter, int prefix_len, bool if_query_empty_keys, bool for_iterator, - bool through_db) { + bool through_db, bool measured_by_nanosecond) { + rocksdb::InternalKeyComparator ikc(opts.comparator); + Slice prefix = Slice(); std::string file_name = test::TmpDir() + "/rocksdb_table_reader_benchmark"; std::string dbname = test::TmpDir() + "/rocksdb_table_reader_bench_db"; - ReadOptions ro; WriteOptions wo; unique_ptr file; Env* env = Env::Default(); @@ -71,7 +77,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, Status s; if (!through_db) { env->NewWritableFile(file_name, &file, env_options); - tb = opts.table_factory->NewTableBuilder(opts, file.get(), + tb = opts.table_factory->NewTableBuilder(opts, ikc, file.get(), CompressionType::kNoCompression); } else { s = DB::Open(opts, dbname, &db); @@ -102,8 +108,8 
@@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, Status s = env->NewRandomAccessFile(file_name, &raf, env_options); uint64_t file_size; env->GetFileSize(file_name, &file_size); - s = opts.table_factory->NewTableReader(opts, env_options, std::move(raf), - file_size, &table_reader); + s = opts.table_factory->NewTableReader( + opts, env_options, ikc, std::move(raf), file_size, &table_reader); } Random rnd(301); @@ -124,15 +130,16 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, if (!for_iterator) { // Query one existing key; std::string key = MakeKey(r1, r2, through_db); - uint64_t start_micros = env->NowMicros(); + uint64_t start_time = Now(env, measured_by_nanosecond); port::MemoryBarrier(); if (!through_db) { - s = table_reader->Get(ro, key, arg, DummySaveValue, nullptr); + s = table_reader->Get(read_options, key, arg, DummySaveValue, + nullptr); } else { - s = db->Get(ro, key, &result); + s = db->Get(read_options, key, &result); } port::MemoryBarrier(); - hist.Add(env->NowMicros() - start_micros); + hist.Add(Now(env, measured_by_nanosecond) - start_time); } else { int r2_len; if (if_query_empty_keys) { @@ -150,7 +157,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, read_options.prefix = &prefix; } uint64_t total_time = 0; - uint64_t start_micros = env->NowMicros(); + uint64_t start_time = Now(env, measured_by_nanosecond); port::MemoryBarrier(); Iterator* iter; if (!through_db) { @@ -165,9 +172,9 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, } // verify key; port::MemoryBarrier(); - total_time += env->NowMicros() - start_micros; + total_time += Now(env, measured_by_nanosecond) - start_time; assert(Slice(MakeKey(r1, r2 + count, through_db)) == iter->key()); - start_micros = env->NowMicros(); + start_time = Now(env, measured_by_nanosecond); if (++count >= r2_len) { break; } @@ -180,7 +187,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, } delete iter; 
port::MemoryBarrier(); - total_time += env->NowMicros() - start_micros; + total_time += Now(env, measured_by_nanosecond) - start_time; hist.Add(total_time); } } @@ -195,9 +202,10 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, "num_key2: %5d %10s\n" "===================================================" "====================================================" - "\nHistogram (unit: microseconds): \n%s", + "\nHistogram (unit: %s): \n%s", opts.table_factory->Name(), num_keys1, num_keys2, - for_iterator? "iterator" : (if_query_empty_keys ? "empty" : "non_empty"), + for_iterator ? "iterator" : (if_query_empty_keys ? "empty" : "non_empty"), + measured_by_nanosecond ? "nanosecond" : "microsecond", hist.ToString().c_str()); if (!through_db) { env->DeleteFile(file_name); @@ -207,7 +215,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, DestroyDB(dbname, opts); } } -} // namespace rocksdb +} // namespace rocksdb DEFINE_bool(query_empty, false, "query non-existing keys instead of existing " "ones."); @@ -220,7 +228,9 @@ DEFINE_bool(through_db, false, "If enable, a DB instance will be created and " "the query will be against DB. Otherwise, will be directly against " "a table reader."); DEFINE_bool(plain_table, false, "Use PlainTable"); - +DEFINE_string(time_unit, "microsecond", + "The time unit used for measuring performance. 
User can specify " + "`microsecond` (default) or `nanosecond`"); int main(int argc, char** argv) { google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + @@ -237,10 +247,9 @@ int main(int argc, char** argv) { rocksdb::EnvOptions env_options; options.create_if_missing = true; options.compression = rocksdb::CompressionType::kNoCompression; - options.internal_comparator = - new rocksdb::InternalKeyComparator(options.comparator); if (FLAGS_plain_table) { + ro.prefix_seek = true; options.allow_mmap_reads = true; env_options.use_mmap_reads = true; tf = new rocksdb::PlainTableFactory(16, (FLAGS_prefix_len == 16) ? 0 : 8, @@ -250,11 +259,15 @@ int main(int argc, char** argv) { } else { tf = new rocksdb::BlockBasedTableFactory(); } + // if user provides invalid options, just fall back to microsecond. + bool measured_by_nanosecond = FLAGS_time_unit == "nanosecond"; + options.table_factory = std::shared_ptr(tf); TableReaderBenchmark(options, env_options, ro, FLAGS_num_keys1, FLAGS_num_keys2, FLAGS_iter, FLAGS_prefix_len, - FLAGS_query_empty, FLAGS_iterator, FLAGS_through_db); + FLAGS_query_empty, FLAGS_iterator, FLAGS_through_db, + measured_by_nanosecond); delete tf; return 0; } diff --git a/table/table_test.cc b/table/table_test.cc index d31cb8396..9f6efbf50 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -306,8 +306,11 @@ class KeyConvertingIterator: public Iterator { class TableConstructor: public Constructor { public: explicit TableConstructor(const Comparator* cmp, - bool convert_to_internal_key = false) - : Constructor(cmp), convert_to_internal_key_(convert_to_internal_key) {} + bool convert_to_internal_key = false, + bool prefix_seek = false) + : Constructor(cmp), + convert_to_internal_key_(convert_to_internal_key), + prefix_seek_(prefix_seek) {} ~TableConstructor() { Reset(); } virtual Status FinishImpl(const Options& options, @@ -347,7 +350,11 @@ class TableConstructor: public Constructor { } virtual Iterator* NewIterator() const { - 
Iterator* iter = table_reader_->NewIterator(ReadOptions()); + ReadOptions ro; + if (prefix_seek_) { + ro.prefix_seek = true; + } + Iterator* iter = table_reader_->NewIterator(ro); if (convert_to_internal_key_) { return new KeyConvertingIterator(iter); } else { @@ -380,6 +387,7 @@ class TableConstructor: public Constructor { source_.reset(); } bool convert_to_internal_key_; + bool prefix_seek_; uint64_t uniq_id_; unique_ptr sink_; @@ -549,6 +557,7 @@ enum TestType { BLOCK_BASED_TABLE_TEST, PLAIN_TABLE_SEMI_FIXED_PREFIX, PLAIN_TABLE_FULL_STR_PREFIX, + PLAIN_TABLE_TOTAL_ORDER, BLOCK_TEST, MEMTABLE_TEST, DB_TEST @@ -565,8 +574,9 @@ static std::vector GenerateArgList() { std::vector test_args; std::vector test_types = { BLOCK_BASED_TABLE_TEST, PLAIN_TABLE_SEMI_FIXED_PREFIX, - PLAIN_TABLE_FULL_STR_PREFIX, BLOCK_TEST, - MEMTABLE_TEST, DB_TEST}; + PLAIN_TABLE_FULL_STR_PREFIX, PLAIN_TABLE_TOTAL_ORDER, + BLOCK_TEST, MEMTABLE_TEST, + DB_TEST}; std::vector reverse_compare_types = {false, true}; std::vector restart_intervals = {16, 1, 1024}; @@ -689,8 +699,8 @@ class Harness { only_support_prefix_seek_ = true; options_.prefix_extractor = prefix_transform.get(); options_.allow_mmap_reads = true; - options_.table_factory.reset(new PlainTableFactory()); - constructor_ = new TableConstructor(options_.comparator, true); + options_.table_factory.reset(NewPlainTableFactory()); + constructor_ = new TableConstructor(options_.comparator, true, true); internal_comparator_.reset( new InternalKeyComparator(options_.comparator)); break; @@ -699,8 +709,18 @@ class Harness { only_support_prefix_seek_ = true; options_.prefix_extractor = noop_transform.get(); options_.allow_mmap_reads = true; - options_.table_factory.reset(new PlainTableFactory()); - constructor_ = new TableConstructor(options_.comparator, true); + options_.table_factory.reset(NewPlainTableFactory()); + constructor_ = new TableConstructor(options_.comparator, true, true); + internal_comparator_.reset( + new 
InternalKeyComparator(options_.comparator)); + break; + case PLAIN_TABLE_TOTAL_ORDER: + support_prev_ = false; + only_support_prefix_seek_ = false; + options_.prefix_extractor = nullptr; + options_.allow_mmap_reads = true; + options_.table_factory.reset(NewTotalOrderPlainTableFactory()); + constructor_ = new TableConstructor(options_.comparator, true, false); internal_comparator_.reset( new InternalKeyComparator(options_.comparator)); break;