diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 17e3e61d8..19cf56661 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -184,10 +184,11 @@ class TestPlainTableReader : public PlainTableReader { unique_ptr&& file, const Options& options, bool* expect_bloom_not_match) : PlainTableReader(options, std::move(file), storage_options, icomparator, - file_size, bloom_bits_per_key, hash_table_ratio, - index_sparseness, table_properties, 2 * 1024 * 1024), + file_size, table_properties), expect_bloom_not_match_(expect_bloom_not_match) { - Status s = PopulateIndex(const_cast(table_properties)); + Status s = PopulateIndex(const_cast(table_properties), + bloom_bits_per_key, hash_table_ratio, + index_sparseness, 2 * 1024 * 1024); ASSERT_TRUE(s.ok()); } diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index 22968ef6b..139c30c30 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -92,26 +92,22 @@ class PlainTableIterator : public Iterator { }; extern const uint64_t kPlainTableMagicNumber; -PlainTableReader::PlainTableReader( - const Options& options, unique_ptr&& file, - const EnvOptions& storage_options, const InternalKeyComparator& icomparator, - uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio, - size_t index_sparseness, const TableProperties* table_properties, - size_t huge_page_tlb_size) - : options_(options), - soptions_(storage_options), - file_(std::move(file)), - internal_comparator_(icomparator), - file_size_(file_size), - kHashTableRatio(hash_table_ratio), - kBloomBitsPerKey(bloom_bits_per_key), - kIndexIntervalForSamePrefixKeys(index_sparseness), - table_properties_(nullptr), +PlainTableReader::PlainTableReader(const Options& options, + unique_ptr&& file, + const EnvOptions& storage_options, + const InternalKeyComparator& icomparator, + uint64_t file_size, + const TableProperties* table_properties) + : internal_comparator_(icomparator), data_end_offset_(table_properties->data_size), user_key_len_(table_properties->fixed_key_len), - huge_page_tlb_size_(huge_page_tlb_size) { - assert(kHashTableRatio >= 0.0); -} + prefix_extractor_(options.prefix_extractor.get()), + enable_bloom_(false), + bloom_(6, nullptr), + options_(options), + file_(std::move(file)), + file_size_(file_size), + table_properties_(nullptr) {} PlainTableReader::~PlainTableReader() { } @@ -138,13 +134,14 @@ Status PlainTableReader::Open(const Options& options, return s; } - std::unique_ptr new_reader(new PlainTableReader( - options, std::move(file), soptions, internal_comparator, file_size, - bloom_bits_per_key, hash_table_ratio, index_sparseness, props, - huge_page_tlb_size)); + assert(hash_table_ratio >= 0.0); + std::unique_ptr new_reader( + new PlainTableReader(options, std::move(file), soptions, + internal_comparator, file_size, props)); // -- Populate Index - s = new_reader->PopulateIndex(props); + s = new_reader->PopulateIndex(props, bloom_bits_per_key, hash_table_ratio, + index_sparseness, huge_page_tlb_size); if (!s.ok()) { return s; } @@ -224,7 +221,9 @@ class PlainTableReader::IndexRecordList { }; Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list, - int* num_prefixes) const { + int* num_prefixes, + int bloom_bits_per_key, + size_t index_sparseness) { Slice prev_key_prefix_slice; uint32_t prev_key_prefix_hash = 0; uint32_t pos = data_start_offset_; @@ -243,9 +242,9 @@ Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list, if (!s.ok()) { return s; } - if (bloom_) { + if (enable_bloom_) { // total order mode and bloom filter is enabled. - bloom_->AddHash(GetSliceHash(key.user_key)); + bloom_.AddHash(GetSliceHash(key.user_key)); } Slice key_prefix_slice = GetPrefix(key); @@ -259,8 +258,8 @@ Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list, prev_key_prefix_hash = GetSliceHash(key_prefix_slice); } - if (kIndexIntervalForSamePrefixKeys == 0 || - num_keys_per_prefix++ % kIndexIntervalForSamePrefixKeys == 0) { + if (index_sparseness == 0 || + num_keys_per_prefix++ % index_sparseness == 0) { // Add an index key for every kIndexIntervalForSamePrefixKeys keys record_list->AddRecord(prev_key_prefix_hash, key_offset); } @@ -274,22 +273,25 @@ Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list, return Status::OK(); } -void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) { - if (options_.prefix_extractor.get() != nullptr) { - uint32_t bloom_total_bits = num_prefixes * kBloomBitsPerKey; +void PlainTableReader::AllocateIndexAndBloom(int num_prefixes, + int bloom_bits_per_key, + double hash_table_ratio, + size_t huge_page_tlb_size) { + if (prefix_extractor_ != nullptr) { + uint32_t bloom_total_bits = num_prefixes * bloom_bits_per_key; if (bloom_total_bits > 0) { - bloom_.reset(new DynamicBloom(bloom_total_bits, options_.bloom_locality, - 6, nullptr, huge_page_tlb_size_, - options_.info_log.get())); + enable_bloom_ = true; + bloom_.SetTotalBits(bloom_total_bits, options_.bloom_locality, + huge_page_tlb_size, options_.info_log.get()); } } - if (options_.prefix_extractor.get() == nullptr || kHashTableRatio <= 0) { + if (prefix_extractor_ == nullptr || hash_table_ratio <= 0) { // Fall back to pure binary search if the user fails to specify a prefix // extractor. index_size_ = 1; } else { - double hash_table_size_multipier = 1.0 / kHashTableRatio; + double hash_table_size_multipier = 1.0 / hash_table_ratio; index_size_ = num_prefixes * hash_table_size_multipier + 1; } } @@ -306,8 +308,8 @@ size_t PlainTableReader::BucketizeIndexesAndFillBloom( if (first || prev_hash != cur_hash) { prev_hash = cur_hash; first = false; - if (bloom_ && !IsTotalOrderMode()) { - bloom_->AddHash(cur_hash); + if (enable_bloom_ && !IsTotalOrderMode()) { + bloom_.AddHash(cur_hash); } } uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_); @@ -332,12 +334,13 @@ size_t PlainTableReader::BucketizeIndexesAndFillBloom( void PlainTableReader::FillIndexes( const size_t kSubIndexSize, const std::vector& hash_to_offsets, - const std::vector& entries_per_bucket) { + const std::vector& entries_per_bucket, + size_t huge_page_tlb_size) { Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index", kSubIndexSize); auto total_allocate_size = sizeof(uint32_t) * index_size_ + kSubIndexSize; char* allocated = arena_.AllocateAligned( - total_allocate_size, huge_page_tlb_size_, options_.info_log.get()); + total_allocate_size, huge_page_tlb_size, options_.info_log.get()); index_ = reinterpret_cast(allocated); sub_index_ = allocated + sizeof(uint32_t) * index_size_; @@ -378,12 +381,16 @@ void PlainTableReader::FillIndexes( index_size_, kSubIndexSize); } -Status PlainTableReader::PopulateIndex(TableProperties* props) { +Status PlainTableReader::PopulateIndex(TableProperties* props, + int bloom_bits_per_key, + double hash_table_ratio, + size_t index_sparseness, + size_t huge_page_tlb_size) { assert(props != nullptr); table_properties_.reset(props); // options.prefix_extractor is requried for a hash-based look-up. - if (options_.prefix_extractor.get() == nullptr && kHashTableRatio != 0) { + if (options_.prefix_extractor.get() == nullptr && hash_table_ratio != 0) { return Status::NotSupported( "PlainTable requires a prefix extractor enable prefix hash mode."); } @@ -403,21 +410,24 @@ Status PlainTableReader::PopulateIndex(TableProperties* props) { // Allocate bloom filter here for total order mode. if (IsTotalOrderMode()) { - uint32_t num_bloom_bits = table_properties_->num_entries * kBloomBitsPerKey; + uint32_t num_bloom_bits = + table_properties_->num_entries * bloom_bits_per_key; if (num_bloom_bits > 0) { - bloom_.reset(new DynamicBloom(num_bloom_bits, options_.bloom_locality, 6, - nullptr, huge_page_tlb_size_, - options_.info_log.get())); + enable_bloom_ = true; + bloom_.SetTotalBits(num_bloom_bits, options_.bloom_locality, + huge_page_tlb_size, options_.info_log.get()); } } - s = PopulateIndexRecordList(&record_list, &num_prefixes); + s = PopulateIndexRecordList(&record_list, &num_prefixes, bloom_bits_per_key, + index_sparseness); if (!s.ok()) { return s; } // Calculated hash table and bloom filter size and allocate memory for indexes // and bloom filter based on the number of prefixes. - AllocateIndexAndBloom(num_prefixes); + AllocateIndexAndBloom(num_prefixes, bloom_bits_per_key, hash_table_ratio, + huge_page_tlb_size); // Bucketize all the index records to a temp data structure, in which for // each bucket, we generate a linked list of IndexRecord, in reversed order. @@ -426,7 +436,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props) { size_t sub_index_size_needed = BucketizeIndexesAndFillBloom( &record_list, &hash_to_offsets, &entries_per_bucket); // From the temp data structure, populate indexes. - FillIndexes(sub_index_size_needed, hash_to_offsets, entries_per_bucket); + FillIndexes(sub_index_size_needed, hash_to_offsets, entries_per_bucket, + huge_page_tlb_size); // Fill two table properties. // TODO(sdong): after we have the feature of storing index in file, this @@ -515,7 +526,7 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, } bool PlainTableReader::MatchBloom(uint32_t hash) const { - return bloom_.get() == nullptr || bloom_->MayContainHash(hash); + return !enable_bloom_ || bloom_.MayContainHash(hash); } Slice PlainTableReader::GetPrefix(const ParsedInternalKey& target) const { diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index 62239beb3..e20a109c7 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -20,6 +20,7 @@ #include "table/table_reader.h" #include "table/plain_table_factory.h" #include "util/arena.h" +#include "util/dynamic_bloom.h" namespace rocksdb { @@ -31,7 +32,6 @@ class RandomAccessFile; struct ReadOptions; class TableCache; class TableReader; -class DynamicBloom; class InternalKeyComparator; using std::unique_ptr; @@ -73,10 +73,7 @@ class PlainTableReader: public TableReader { PlainTableReader(const Options& options, unique_ptr&& file, const EnvOptions& storage_options, const InternalKeyComparator& internal_comparator, - uint64_t file_size, int bloom_num_bits, - double hash_table_ratio, size_t index_sparseness, - const TableProperties* table_properties, - size_t huge_page_tlb_size); + uint64_t file_size, const TableProperties* table_properties); virtual ~PlainTableReader(); protected: @@ -126,7 +123,9 @@ class PlainTableReader: public TableReader { // .... // record N file offset: fixedint32 // - Status PopulateIndex(TableProperties* props); + Status PopulateIndex(TableProperties* props, int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness, + size_t huge_page_tlb_size); private: struct IndexRecord; @@ -141,35 +140,17 @@ class PlainTableReader: public TableReader { uint32_t* index_; int index_size_ = 0; char* sub_index_; - - Options options_; - const EnvOptions& soptions_; - unique_ptr file_; - const InternalKeyComparator internal_comparator_; // represents plain table's current status. Status status_; - Slice file_data_; - uint32_t file_size_; - const double kHashTableRatio; - const int kBloomBitsPerKey; - // To speed up the search for keys with same prefix, we'll add index key for - // every N keys, where the "N" is determined by - // kIndexIntervalForSamePrefixKeys - const size_t kIndexIntervalForSamePrefixKeys = 16; - // Bloom filter is used to rule out non-existent key - unique_ptr bloom_; - Arena arena_; - - std::shared_ptr table_properties_; // data_start_offset_ and data_end_offset_ defines the range of the // sst file that stores data. const uint32_t data_start_offset_ = 0; const uint32_t data_end_offset_; const size_t user_key_len_; - const size_t huge_page_tlb_size_; + const SliceTransform* prefix_extractor_; static const size_t kNumInternalBytes = 8; static const uint32_t kSubIndexMask = 0x80000000; @@ -177,6 +158,16 @@ class PlainTableReader: public TableReader { static const uint64_t kMaxFileSize = 1u << 31; static const size_t kRecordsPerGroup = 256; + // Bloom filter is used to rule out non-existent key + bool enable_bloom_; + DynamicBloom bloom_; + Arena arena_; + + const Options& options_; + unique_ptr file_; + uint32_t file_size_; + std::shared_ptr table_properties_; + bool IsFixedLength() const { return user_key_len_ != kPlainTableVariableLength; } @@ -193,10 +184,13 @@ class PlainTableReader: public TableReader { // If bloom_ is not null, all the keys' full-key hash will be added to the // bloom filter. Status PopulateIndexRecordList(IndexRecordList* record_list, - int* num_prefixes) const; + int* num_prefixes, int bloom_bits_per_key, + size_t index_sparseness); // Internal helper function to allocate memory for indexes and bloom filters - void AllocateIndexAndBloom(int num_prefixes); + void AllocateIndexAndBloom(int num_prefixes, int bloom_bits_per_key, + double hash_table_ratio, + size_t huge_page_tlb_size); // Internal helper function to bucket index record list to hash buckets. // bucket_header is a vector of size hash_table_size_, with each entry @@ -214,7 +208,8 @@ class PlainTableReader: public TableReader { // indexes and counts generated by BucketizeIndexesAndFillBloom(). void FillIndexes(const size_t kSubIndexSize, const std::vector& bucket_headers, - const std::vector& entries_per_bucket); + const std::vector& entries_per_bucket, + size_t huge_page_tlb_size); // Read a plain table key from the position `start`. The read content // will be written to `key` and the size of read bytes will be populated @@ -244,7 +239,7 @@ class PlainTableReader: public TableReader { Slice GetPrefixFromUserKey(const Slice& user_key) const { if (!IsTotalOrderMode()) { - return options_.prefix_extractor->Transform(user_key); + return prefix_extractor_->Transform(user_key); } else { // Use empty slice as prefix if prefix_extractor is not set. In that case, // it falls back to pure binary search and total iterator seek is @@ -253,9 +248,7 @@ class PlainTableReader: public TableReader { } } - bool IsTotalOrderMode() const { - return (options_.prefix_extractor.get() == nullptr); - } + bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); } // No copying allowed explicit PlainTableReader(const TableReader&) = delete; diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc index 7173bbb93..b90f199ae 100644 --- a/util/dynamic_bloom.cc +++ b/util/dynamic_bloom.cc @@ -18,15 +18,17 @@ static uint32_t BloomHash(const Slice& key) { return Hash(key.data(), key.size(), 0xbc9f1d34); } -uint32_t GetNumBlocks(uint32_t total_bits) { - uint32_t num_blocks = (total_bits + CACHE_LINE_SIZE * 8 - 1) / - (CACHE_LINE_SIZE * 8) * (CACHE_LINE_SIZE * 8); +uint32_t GetTotalBitsForLocality(uint32_t total_bits) { + uint32_t num_blocks = + (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); + // Make num_blocks an odd number to make sure more bits are involved // when determining which block. if (num_blocks % 2 == 0) { num_blocks++; } - return num_blocks; + + return num_blocks * (CACHE_LINE_SIZE * 8); } } @@ -34,11 +36,23 @@ DynamicBloom::DynamicBloom(uint32_t total_bits, uint32_t locality, uint32_t num_probes, uint32_t (*hash_func)(const Slice& key), size_t huge_page_tlb_size, Logger* logger) - : kTotalBits(((locality > 0) ? GetNumBlocks(total_bits) : total_bits + 7) / - 8 * 8), - kNumBlocks((locality > 0) ? kTotalBits / (CACHE_LINE_SIZE * 8) : 0), + : DynamicBloom(num_probes, hash_func) { + SetTotalBits(total_bits, locality, huge_page_tlb_size, logger); +} + +DynamicBloom::DynamicBloom(uint32_t num_probes, + uint32_t (*hash_func)(const Slice& key)) + : kTotalBits(0), + kNumBlocks(0), kNumProbes(num_probes), - hash_func_(hash_func == nullptr ? &BloomHash : hash_func) { + hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {} + +void DynamicBloom::SetTotalBits(uint32_t total_bits, uint32_t locality, + size_t huge_page_tlb_size, Logger* logger) { + kTotalBits = (locality > 0) ? GetTotalBitsForLocality(total_bits) + : (total_bits + 7) / 8 * 8; + kNumBlocks = (locality > 0) ? (kTotalBits / (CACHE_LINE_SIZE * 8)) : 0; + assert(kNumBlocks > 0 || kTotalBits > 0); assert(kNumProbes > 0); diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h index e59134591..2a0dc8434 100644 --- a/util/dynamic_bloom.h +++ b/util/dynamic_bloom.h @@ -33,6 +33,12 @@ class DynamicBloom { size_t huge_page_tlb_size = 0, Logger* logger = nullptr); + explicit DynamicBloom(uint32_t num_probes = 6, + uint32_t (*hash_func)(const Slice& key) = nullptr); + + void SetTotalBits(uint32_t total_bits, uint32_t locality, + size_t huge_page_tlb_size, Logger* logger); + ~DynamicBloom() {} // Assuming single threaded access to this function. @@ -42,14 +48,14 @@ class DynamicBloom { void AddHash(uint32_t hash); // Multithreaded access to this function is OK - bool MayContain(const Slice& key); + bool MayContain(const Slice& key) const; // Multithreaded access to this function is OK - bool MayContainHash(uint32_t hash); + bool MayContainHash(uint32_t hash) const; private: - const uint32_t kTotalBits; - const uint32_t kNumBlocks; + uint32_t kTotalBits; + uint32_t kNumBlocks; const uint32_t kNumProbes; uint32_t (*hash_func_)(const Slice& key); @@ -61,11 +67,12 @@ class DynamicBloom { inline void DynamicBloom::Add(const Slice& key) { AddHash(hash_func_(key)); } -inline bool DynamicBloom::MayContain(const Slice& key) { +inline bool DynamicBloom::MayContain(const Slice& key) const { return (MayContainHash(hash_func_(key))); } -inline bool DynamicBloom::MayContainHash(uint32_t h) { +inline bool DynamicBloom::MayContainHash(uint32_t h) const { + assert(kNumBlocks > 0 || kTotalBits > 0); const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits if (kNumBlocks != 0) { uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8); @@ -82,6 +89,10 @@ inline bool DynamicBloom::MayContainHash(uint32_t h) { h += delta; } } else { + if (kTotalBits == 0) { + // Not initialized. + return true; + } for (uint32_t i = 0; i < kNumProbes; ++i) { const uint32_t bitpos = h % kTotalBits; if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) { @@ -94,6 +105,7 @@ inline bool DynamicBloom::MayContainHash(uint32_t h) { } inline void DynamicBloom::AddHash(uint32_t h) { + assert(kNumBlocks > 0 || kTotalBits > 0); const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits if (kNumBlocks != 0) { uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);