diff --git a/CMakeLists.txt b/CMakeLists.txt index a094d3261..62c678ea9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -597,7 +597,6 @@ set(SOURCES table/block_based/partitioned_filter_block.cc table/block_based/uncompression_dict_reader.cc table/block_fetcher.cc - table/bloom_block.cc table/cuckoo/cuckoo_table_builder.cc table/cuckoo/cuckoo_table_factory.cc table/cuckoo/cuckoo_table_reader.cc @@ -607,6 +606,7 @@ set(SOURCES table/merging_iterator.cc table/meta_blocks.cc table/persistent_cache_helper.cc + table/plain/plain_table_bloom.cc table/plain/plain_table_builder.cc table/plain/plain_table_factory.cc table/plain/plain_table_index.cc diff --git a/TARGETS b/TARGETS index 058e591e8..0a8388775 100644 --- a/TARGETS +++ b/TARGETS @@ -229,7 +229,6 @@ cpp_library( "table/block_based/partitioned_filter_block.cc", "table/block_based/uncompression_dict_reader.cc", "table/block_fetcher.cc", - "table/bloom_block.cc", "table/cuckoo/cuckoo_table_builder.cc", "table/cuckoo/cuckoo_table_factory.cc", "table/cuckoo/cuckoo_table_reader.cc", @@ -239,6 +238,7 @@ cpp_library( "table/merging_iterator.cc", "table/meta_blocks.cc", "table/persistent_cache_helper.cc", + "table/plain/plain_table_bloom.cc", "table/plain/plain_table_builder.cc", "table/plain/plain_table_factory.cc", "table/plain/plain_table_index.cc", diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 68df71768..a2f191080 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -24,8 +24,8 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" -#include "table/bloom_block.h" #include "table/meta_blocks.h" +#include "table/plain/plain_table_bloom.h" #include "table/plain/plain_table_factory.h" #include "table/plain/plain_table_key_coding.h" #include "table/plain/plain_table_reader.h" @@ -730,6 +730,54 @@ TEST_P(PlainTableDBTest, Iterator) { } } +namespace { +std::string NthKey(size_t n, char filler) { + std::string rv(16, filler); + rv[0] = n % 10; + rv[1] = (n / 10) % 10; + rv[2] = (n / 100) % 10; + rv[3] = (n / 1000) % 10; + return rv; +} +} // anonymous namespace + +TEST_P(PlainTableDBTest, BloomSchema) { + Options options = CurrentOptions(); + options.create_if_missing = true; + for (int bloom_locality = 0; bloom_locality <= 1; bloom_locality++) { + options.bloom_locality = bloom_locality; + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 16; + plain_table_options.bloom_bits_per_key = 3; // high FP rate for test + plain_table_options.hash_table_ratio = 0.75; + plain_table_options.index_sparseness = 16; + plain_table_options.huge_page_tlb_size = 0; + plain_table_options.encoding_type = kPlain; + + + bool expect_bloom_not_match = false; + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, plain_table_options, + 0 /* column_family_id */, kDefaultColumnFamilyName)); + DestroyAndReopen(&options); + + for (unsigned i = 0; i < 2345; ++i) { + ASSERT_OK(Put(NthKey(i, 'y'), "added")); + } + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("added", Get(NthKey(42, 'y'))); + + for (unsigned i = 0; i < 32; ++i) { + // Known pattern of Bloom filter false positives can detect schema change + // with high probability. Known FPs stuffed into bits: + bool expect_fp = (bloom_locality ? 2421694657UL : 1785868347UL) + & (1UL << i); + expect_bloom_not_match = !expect_fp; + ASSERT_EQ("NOT_FOUND", Get(NthKey(i, 'n'))); + } + } +} + namespace { std::string MakeLongKey(size_t length, char c) { return std::string(length, c); diff --git a/src.mk b/src.mk index 6d087861d..4c7689755 100644 --- a/src.mk +++ b/src.mk @@ -124,7 +124,6 @@ LIB_SOURCES = \ table/block_based/partitioned_filter_block.cc \ table/block_based/uncompression_dict_reader.cc \ table/block_fetcher.cc \ - table/bloom_block.cc \ table/cuckoo/cuckoo_table_builder.cc \ table/cuckoo/cuckoo_table_factory.cc \ table/cuckoo/cuckoo_table_reader.cc \ @@ -134,6 +133,7 @@ LIB_SOURCES = \ table/merging_iterator.cc \ table/meta_blocks.cc \ table/persistent_cache_helper.cc \ + table/plain/plain_table_bloom.cc \ table/plain/plain_table_builder.cc \ table/plain/plain_table_factory.cc \ table/plain/plain_table_index.cc \ diff --git a/table/bloom_block.cc b/table/bloom_block.cc deleted file mode 100644 index 61959030a..000000000 --- a/table/bloom_block.cc +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#include "table/bloom_block.h" - -#include -#include "rocksdb/slice.h" -#include "util/dynamic_bloom.h" - -namespace rocksdb { - -void BloomBlockBuilder::AddKeysHashes(const std::vector& keys_hashes) { - for (auto hash : keys_hashes) { - bloom_.AddHash(hash); - } -} - -Slice BloomBlockBuilder::Finish() { return bloom_.GetRawData(); } - -const std::string BloomBlockBuilder::kBloomBlock = "kBloomBlock"; -} // namespace rocksdb diff --git a/table/bloom_block.h b/table/bloom_block.h deleted file mode 100644 index 483fa25d9..000000000 --- a/table/bloom_block.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -#pragma once - -#include -#include -#include "util/dynamic_bloom.h" - -namespace rocksdb { -class Logger; - -class BloomBlockBuilder { - public: - static const std::string kBloomBlock; - - explicit BloomBlockBuilder(uint32_t num_probes = 6) : bloom_(num_probes) {} - - void SetTotalBits(Allocator* allocator, uint32_t total_bits, - uint32_t locality, size_t huge_page_tlb_size, - Logger* logger) { - bloom_.SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size, - logger); - } - - uint32_t GetNumBlocks() const { return bloom_.GetNumBlocks(); } - - void AddKeysHashes(const std::vector& keys_hashes); - - Slice Finish(); - - private: - DynamicBloom bloom_; -}; - -}; // namespace rocksdb diff --git a/table/plain/plain_table_bloom.cc b/table/plain/plain_table_bloom.cc new file mode 100644 index 000000000..778b3b558 --- /dev/null +++ b/table/plain/plain_table_bloom.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "table/plain/plain_table_bloom.h" + +#include +#include +#include "util/dynamic_bloom.h" + +#include "memory/allocator.h" + + +namespace rocksdb { + +namespace { + +uint32_t GetTotalBitsForLocality(uint32_t total_bits) { + uint32_t num_blocks = + (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); + + // Make num_blocks an odd number to make sure more bits are involved + // when determining which block. + if (num_blocks % 2 == 0) { + num_blocks++; + } + + return num_blocks * (CACHE_LINE_SIZE * 8); +} +} + +PlainTableBloomV1::PlainTableBloomV1(uint32_t num_probes) + : kTotalBits(0), kNumBlocks(0), kNumProbes(num_probes), data_(nullptr) {} + +void PlainTableBloomV1::SetRawData(unsigned char* raw_data, uint32_t total_bits, + uint32_t num_blocks) { + data_ = reinterpret_cast(raw_data); + kTotalBits = total_bits; + kNumBlocks = num_blocks; +} + +void PlainTableBloomV1::SetTotalBits(Allocator* allocator, + uint32_t total_bits, uint32_t locality, + size_t huge_page_tlb_size, + Logger* logger) { + kTotalBits = (locality > 0) ? GetTotalBitsForLocality(total_bits) + : (total_bits + 7) / 8 * 8; + kNumBlocks = (locality > 0) ? (kTotalBits / (CACHE_LINE_SIZE * 8)) : 0; + + assert(kNumBlocks > 0 || kTotalBits > 0); + assert(kNumProbes > 0); + + uint32_t sz = kTotalBits / 8; + if (kNumBlocks > 0) { + sz += CACHE_LINE_SIZE - 1; + } + assert(allocator); + + char* raw = allocator->AllocateAligned(sz, huge_page_tlb_size, logger); + memset(raw, 0, sz); + auto cache_line_offset = reinterpret_cast(raw) % CACHE_LINE_SIZE; + if (kNumBlocks > 0 && cache_line_offset > 0) { + raw += CACHE_LINE_SIZE - cache_line_offset; + } + data_ = reinterpret_cast(raw); +} + +void BloomBlockBuilder::AddKeysHashes(const std::vector& keys_hashes) { + for (auto hash : keys_hashes) { + bloom_.AddHash(hash); + } +} + +Slice BloomBlockBuilder::Finish() { return bloom_.GetRawData(); } + +const std::string BloomBlockBuilder::kBloomBlock = "kBloomBlock"; +} // namespace rocksdb diff --git a/table/plain/plain_table_bloom.h b/table/plain/plain_table_bloom.h new file mode 100644 index 000000000..08c72b2dc --- /dev/null +++ b/table/plain/plain_table_bloom.h @@ -0,0 +1,161 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include +#include + +#include "rocksdb/slice.h" + +#include "port/port.h" +#include "util/hash.h" + +#include + +namespace rocksdb { +class Slice; +class Allocator; +class Logger; + +class PlainTableBloomV1 { + public: + // allocator: pass allocator to bloom filter, hence trace the usage of memory + // total_bits: fixed total bits for the bloom + // num_probes: number of hash probes for a single key + // locality: If positive, optimize for cache line locality, 0 otherwise. + // hash_func: customized hash function + // huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB + // within this page size. Need to reserve huge pages for + // it to be allocated, like: + // sysctl -w vm.nr_hugepages=20 + // See linux doc Documentation/vm/hugetlbpage.txt + explicit PlainTableBloomV1(uint32_t num_probes = 6); + void SetTotalBits(Allocator* allocator, uint32_t total_bits, + uint32_t locality, size_t huge_page_tlb_size, + Logger* logger); + + ~PlainTableBloomV1() {} + + // Assuming single threaded access to this function. + void AddHash(uint32_t hash); + + // Multithreaded access to this function is OK + bool MayContainHash(uint32_t hash) const; + + void Prefetch(uint32_t hash); + + uint32_t GetNumBlocks() const { return kNumBlocks; } + + Slice GetRawData() const { + return Slice(reinterpret_cast(data_), GetTotalBits() / 8); + } + + void SetRawData(unsigned char* raw_data, uint32_t total_bits, + uint32_t num_blocks = 0); + + uint32_t GetTotalBits() const { return kTotalBits; } + + bool IsInitialized() const { return kNumBlocks > 0 || kTotalBits > 0; } + + private: + uint32_t kTotalBits; + uint32_t kNumBlocks; + const uint32_t kNumProbes; + + uint8_t* data_; +}; + +#if defined(_MSC_VER) +#pragma warning(push) +// local variable is initialized but not referenced +#pragma warning(disable : 4189) +#endif +inline void PlainTableBloomV1::Prefetch(uint32_t h) { + if (kNumBlocks != 0) { + uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8); + PREFETCH(&(data_[b / 8]), 0, 3); + } +} +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +inline bool PlainTableBloomV1::MayContainHash(uint32_t h) const { + assert(IsInitialized()); + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + if (kNumBlocks != 0) { + uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8); + for (uint32_t i = 0; i < kNumProbes; ++i) { + // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized + // to a simple and operation by compiler. + const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8)); + if ((data_[bitpos / 8] & (1 << (bitpos % 8))) == 0) { + return false; + } + // Rotate h so that we don't reuse the same bytes. + h = h / (CACHE_LINE_SIZE * 8) + + (h % (CACHE_LINE_SIZE * 8)) * (0x20000000U / CACHE_LINE_SIZE); + h += delta; + } + } else { + for (uint32_t i = 0; i < kNumProbes; ++i) { + const uint32_t bitpos = h % kTotalBits; + if ((data_[bitpos / 8] & (1 << (bitpos % 8))) == 0) { + return false; + } + h += delta; + } + } + return true; +} + +inline void PlainTableBloomV1::AddHash(uint32_t h) { + assert(IsInitialized()); + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + if (kNumBlocks != 0) { + uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8); + for (uint32_t i = 0; i < kNumProbes; ++i) { + // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized + // to a simple and operation by compiler. + const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8)); + data_[bitpos / 8] |= (1 << (bitpos % 8)); + // Rotate h so that we don't reuse the same bytes. + h = h / (CACHE_LINE_SIZE * 8) + + (h % (CACHE_LINE_SIZE * 8)) * (0x20000000U / CACHE_LINE_SIZE); + h += delta; + } + } else { + for (uint32_t i = 0; i < kNumProbes; ++i) { + const uint32_t bitpos = h % kTotalBits; + data_[bitpos / 8] |= (1 << (bitpos % 8)); + h += delta; + } + } +} + +class BloomBlockBuilder { + public: + static const std::string kBloomBlock; + + explicit BloomBlockBuilder(uint32_t num_probes = 6) : bloom_(num_probes) {} + + void SetTotalBits(Allocator* allocator, uint32_t total_bits, + uint32_t locality, size_t huge_page_tlb_size, + Logger* logger) { + bloom_.SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size, + logger); + } + + uint32_t GetNumBlocks() const { return bloom_.GetNumBlocks(); } + + void AddKeysHashes(const std::vector& keys_hashes); + + Slice Finish(); + + private: + PlainTableBloomV1 bloom_; +}; + +}; // namespace rocksdb diff --git a/table/plain/plain_table_builder.cc b/table/plain/plain_table_builder.cc index 4d50d8176..8a51b64e6 100644 --- a/table/plain/plain_table_builder.cc +++ b/table/plain/plain_table_builder.cc @@ -19,9 +19,9 @@ #include "rocksdb/options.h" #include "rocksdb/table.h" #include "table/block_based/block_builder.h" -#include "table/bloom_block.h" #include "table/format.h" #include "table/meta_blocks.h" +#include "table/plain/plain_table_bloom.h" #include "table/plain/plain_table_factory.h" #include "table/plain/plain_table_index.h" #include "util/coding.h" diff --git a/table/plain/plain_table_builder.h b/table/plain/plain_table_builder.h index 0a29098d6..ce2169a38 100644 --- a/table/plain/plain_table_builder.h +++ b/table/plain/plain_table_builder.h @@ -13,7 +13,7 @@ #include "rocksdb/status.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" -#include "table/bloom_block.h" +#include "table/plain/plain_table_bloom.h" #include "table/plain/plain_table_index.h" #include "table/plain/plain_table_key_coding.h" #include "table/table_builder.h" diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index 63a28e34a..3d5c4f2db 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -21,11 +21,11 @@ #include "table/block_based/block.h" #include "table/block_based/filter_block.h" -#include "table/bloom_block.h" #include "table/format.h" #include "table/get_context.h" #include "table/internal_iterator.h" #include "table/meta_blocks.h" +#include "table/plain/plain_table_bloom.h" #include "table/plain/plain_table_factory.h" #include "table/plain/plain_table_key_coding.h" #include "table/two_level_iterator.h" diff --git a/table/plain/plain_table_reader.h b/table/plain/plain_table_reader.h index ab108b216..02539cc69 100644 --- a/table/plain/plain_table_reader.h +++ b/table/plain/plain_table_reader.h @@ -19,10 +19,10 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" +#include "table/plain/plain_table_bloom.h" #include "table/plain/plain_table_factory.h" #include "table/plain/plain_table_index.h" #include "table/table_reader.h" -#include "util/dynamic_bloom.h" #include "util/file_reader_writer.h" namespace rocksdb { @@ -155,7 +155,7 @@ class PlainTableReader: public TableReader { // Bloom filter is used to rule out non-existent key bool enable_bloom_; - DynamicBloom bloom_; + PlainTableBloomV1 bloom_; PlainTableReaderFileInfo file_info_; Arena arena_; CacheAllocationPtr index_block_alloc_; diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc index 4dfccb0bf..e5210d1fb 100644 --- a/util/dynamic_bloom.cc +++ b/util/dynamic_bloom.cc @@ -33,24 +33,7 @@ uint32_t GetTotalBitsForLocality(uint32_t total_bits) { DynamicBloom::DynamicBloom(Allocator* allocator, uint32_t total_bits, uint32_t locality, uint32_t num_probes, size_t huge_page_tlb_size, Logger* logger) - : DynamicBloom(num_probes) { - SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size, logger); -} - -DynamicBloom::DynamicBloom(uint32_t num_probes) - : kTotalBits(0), kNumBlocks(0), kNumProbes(num_probes), data_(nullptr) {} - -void DynamicBloom::SetRawData(unsigned char* raw_data, uint32_t total_bits, - uint32_t num_blocks) { - data_ = reinterpret_cast*>(raw_data); - kTotalBits = total_bits; - kNumBlocks = num_blocks; -} - -void DynamicBloom::SetTotalBits(Allocator* allocator, - uint32_t total_bits, uint32_t locality, - size_t huge_page_tlb_size, - Logger* logger) { + : kNumProbes(num_probes) { kTotalBits = (locality > 0) ? GetTotalBitsForLocality(total_bits) : (total_bits + 7) / 8 * 8; kNumBlocks = (locality > 0) ? (kTotalBits / (CACHE_LINE_SIZE * 8)) : 0; diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h index 654bc9ad5..8b31f3c48 100644 --- a/util/dynamic_bloom.h +++ b/util/dynamic_bloom.h @@ -21,6 +21,9 @@ class Slice; class Allocator; class Logger; +// A Bloom filter intended only to be used in memory, never serialized in a way +// that could lead to schema incompatibility. Supports opt-in lock-free +// concurrent access. class DynamicBloom { public: // allocator: pass allocator to bloom filter, hence trace the usage of memory @@ -39,12 +42,6 @@ class DynamicBloom { size_t huge_page_tlb_size = 0, Logger* logger = nullptr); - explicit DynamicBloom(uint32_t num_probes = 6); - - void SetTotalBits(Allocator* allocator, uint32_t total_bits, - uint32_t locality, size_t huge_page_tlb_size, - Logger* logger); - ~DynamicBloom() {} // Assuming single threaded access to this function. @@ -69,17 +66,6 @@ class DynamicBloom { uint32_t GetNumBlocks() const { return kNumBlocks; } - Slice GetRawData() const { - return Slice(reinterpret_cast(data_), GetTotalBits() / 8); - } - - void SetRawData(unsigned char* raw_data, uint32_t total_bits, - uint32_t num_blocks = 0); - - uint32_t GetTotalBits() const { return kTotalBits; } - - bool IsInitialized() const { return kNumBlocks > 0 || kTotalBits > 0; } - private: uint32_t kTotalBits; uint32_t kNumBlocks; @@ -126,7 +112,7 @@ inline bool DynamicBloom::MayContain(const Slice& key) const { #if defined(_MSC_VER) #pragma warning(push) // local variable is initialized but not referenced -#pragma warning(disable : 4189) +#pragma warning(disable : 4189) #endif inline void DynamicBloom::Prefetch(uint32_t h) { if (kNumBlocks != 0) { @@ -139,7 +125,6 @@ inline void DynamicBloom::Prefetch(uint32_t h) { #endif inline bool DynamicBloom::MayContainHash(uint32_t h) const { - assert(IsInitialized()); const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits if (kNumBlocks != 0) { uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8); @@ -171,7 +156,6 @@ inline bool DynamicBloom::MayContainHash(uint32_t h) const { template inline void DynamicBloom::AddHash(uint32_t h, const OrFunc& or_func) { - assert(IsInitialized()); const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits if (kNumBlocks != 0) { uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);