Adding option to save PlainTable index and bloom filter in SST file.

Summary:
Adding option to save PlainTable index and bloom filter in SST file.
If the file has no bloom block and/or index block, PlainTableReader builds
new ones. Otherwise PlainTableReader just uses the stored blocks.

Test Plan: make all check

Reviewers: sdong

Reviewed By: sdong

Subscribers: leveldb

Differential Revision: https://reviews.facebook.net/D19527
main
Stanislau Hlebik 11 years ago
parent 92d73cbe78
commit 9d70cce047
  1. 102
      db/plain_table_db_test.cc
  2. 7
      include/rocksdb/table.h
  3. 23
      table/bloom_block.cc
  4. 37
      table/bloom_block.h
  5. 68
      table/meta_blocks.cc
  6. 15
      table/meta_blocks.h
  7. 97
      table/plain_table_builder.cc
  8. 48
      table/plain_table_builder.h
  9. 10
      table/plain_table_factory.cc
  10. 4
      table/plain_table_factory.h
  11. 196
      table/plain_table_index.cc
  12. 221
      table/plain_table_index.h
  13. 360
      table/plain_table_reader.cc
  14. 140
      table/plain_table_reader.h
  15. 7
      util/dynamic_bloom.cc
  16. 25
      util/dynamic_bloom.h
  17. 3
      util/hash.h

@ -23,6 +23,7 @@
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"
#include "table/meta_blocks.h"
#include "table/bloom_block.h"
#include "table/plain_table_factory.h"
#include "table/plain_table_reader.h"
#include "util/hash.h"
@ -70,10 +71,11 @@ class PlainTableDBTest {
plain_table_options.huge_page_tlb_size = 0;
plain_table_options.encoding_type = kPrefix;
plain_table_options.full_scan_mode = false;
plain_table_options.store_index_in_file = false;
options.table_factory.reset(NewPlainTableFactory(plain_table_options));
options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true, 3));
options.prefix_extractor.reset(NewFixedPrefixTransform(8));
options.allow_mmap_reads = true;
return options;
@ -186,6 +188,8 @@ TEST(PlainTableDBTest, Empty) {
ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
}
extern const uint64_t kPlainTableMagicNumber;
class TestPlainTableReader : public PlainTableReader {
public:
TestPlainTableReader(const EnvOptions& storage_options,
@ -195,7 +199,8 @@ class TestPlainTableReader : public PlainTableReader {
size_t index_sparseness,
const TableProperties* table_properties,
unique_ptr<RandomAccessFile>&& file,
const Options& options, bool* expect_bloom_not_match)
const Options& options, bool* expect_bloom_not_match,
bool store_index_in_file)
: PlainTableReader(options, std::move(file), storage_options, icomparator,
encoding_type, file_size, table_properties),
expect_bloom_not_match_(expect_bloom_not_match) {
@ -206,6 +211,19 @@ class TestPlainTableReader : public PlainTableReader {
bloom_bits_per_key, hash_table_ratio, index_sparseness,
2 * 1024 * 1024);
ASSERT_TRUE(s.ok());
TableProperties* props = const_cast<TableProperties*>(table_properties);
if (store_index_in_file) {
auto bloom_version_ptr = props->user_collected_properties.find(
PlainTablePropertyNames::kBloomVersion);
ASSERT_TRUE(bloom_version_ptr != props->user_collected_properties.end());
ASSERT_EQ(bloom_version_ptr->second, std::string("1"));
if (options.bloom_locality > 0) {
auto num_blocks_ptr = props->user_collected_properties.find(
PlainTablePropertyNames::kNumBloomBlocks);
ASSERT_TRUE(num_blocks_ptr != props->user_collected_properties.end());
}
}
}
virtual ~TestPlainTableReader() {}
@ -213,7 +231,11 @@ class TestPlainTableReader : public PlainTableReader {
private:
virtual bool MatchBloom(uint32_t hash) const override {
bool ret = PlainTableReader::MatchBloom(hash);
ASSERT_TRUE(!*expect_bloom_not_match_ || !ret);
if (*expect_bloom_not_match_) {
ASSERT_TRUE(!ret);
} else {
ASSERT_TRUE(ret);
}
return ret;
}
bool* expect_bloom_not_match_;
@ -228,6 +250,7 @@ class TestPlainTableFactory : public PlainTableFactory {
bloom_bits_per_key_(options.bloom_bits_per_key),
hash_table_ratio_(options.hash_table_ratio),
index_sparseness_(options.index_sparseness),
store_index_in_file_(options.store_index_in_file),
expect_bloom_not_match_(expect_bloom_not_match) {}
Status NewTableReader(const Options& options, const EnvOptions& soptions,
@ -239,6 +262,20 @@ class TestPlainTableFactory : public PlainTableFactory {
options.env, options.info_log.get(), &props);
ASSERT_TRUE(s.ok());
if (store_index_in_file_) {
BlockHandle bloom_block_handle;
s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber,
options.env, BloomBlockBuilder::kBloomBlock,
&bloom_block_handle);
ASSERT_TRUE(s.ok());
BlockHandle index_block_handle;
s = FindMetaBlock(
file.get(), file_size, kPlainTableMagicNumber, options.env,
PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_handle);
ASSERT_TRUE(s.ok());
}
auto& user_props = props->user_collected_properties;
auto encoding_type_prop =
user_props.find(PlainTablePropertyNames::kEncodingType);
@ -249,7 +286,8 @@ class TestPlainTableFactory : public PlainTableFactory {
std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader(
soptions, internal_comparator, encoding_type, file_size,
bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props,
std::move(file), options, expect_bloom_not_match_));
std::move(file), options, expect_bloom_not_match_,
store_index_in_file_));
*table = std::move(new_reader);
return s;
@ -259,6 +297,7 @@ class TestPlainTableFactory : public PlainTableFactory {
int bloom_bits_per_key_;
double hash_table_ratio_;
size_t index_sparseness_;
bool store_index_in_file_;
bool* expect_bloom_not_match_;
};
@ -268,6 +307,12 @@ TEST(PlainTableDBTest, Flush) {
for (EncodingType encoding_type : {kPlain, kPrefix}) {
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
for (int total_order = 0; total_order <= 1; total_order++) {
for (int store_index_in_file = 0; store_index_in_file <= 1;
++store_index_in_file) {
if (!bloom_bits && store_index_in_file) {
continue;
}
Options options = CurrentOptions();
options.create_if_missing = true;
// Set only one bucket to force bucket conflict.
@ -283,6 +328,7 @@ TEST(PlainTableDBTest, Flush) {
plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
plain_table_options.encoding_type = encoding_type;
plain_table_options.full_scan_mode = false;
plain_table_options.store_index_in_file = store_index_in_file;
options.table_factory.reset(
NewPlainTableFactory(plain_table_options));
@ -295,12 +341,12 @@ TEST(PlainTableDBTest, Flush) {
plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
plain_table_options.encoding_type = encoding_type;
plain_table_options.full_scan_mode = false;
plain_table_options.store_index_in_file = store_index_in_file;
options.table_factory.reset(
NewPlainTableFactory(plain_table_options));
}
DestroyAndReopen(&options);
ASSERT_OK(Put("1000000000000foo", "v1"));
ASSERT_OK(Put("0000000000000bar", "v2"));
ASSERT_OK(Put("1000000000000foo", "v3"));
@ -311,17 +357,26 @@ TEST(PlainTableDBTest, Flush) {
ASSERT_EQ(1U, ptc.size());
auto row = ptc.begin();
auto tp = row->second;
ASSERT_EQ(total_order ? "4" : "12", (tp->user_collected_properties).at(
"plain_table_hash_table_size"));
if (!store_index_in_file) {
ASSERT_EQ(total_order ? "4" : "12",
(tp->user_collected_properties)
.at("plain_table_hash_table_size"));
ASSERT_EQ("0", (tp->user_collected_properties)
.at("plain_table_sub_index_size"));
} else {
ASSERT_EQ("0", (tp->user_collected_properties)
.at("plain_table_hash_table_size"));
ASSERT_EQ("0", (tp->user_collected_properties)
.at("plain_table_sub_index_size"));
}
ASSERT_EQ("v3", Get("1000000000000foo"));
ASSERT_EQ("v2", Get("0000000000000bar"));
}
}
}
}
}
}
TEST(PlainTableDBTest, Flush2) {
@ -330,7 +385,15 @@ TEST(PlainTableDBTest, Flush2) {
for (EncodingType encoding_type : {kPlain, kPrefix}) {
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
for (int total_order = 0; total_order <= 1; total_order++) {
if (encoding_type == kPrefix && total_order == 1) {
for (int store_index_in_file = 0; store_index_in_file <= 1;
++store_index_in_file) {
if (encoding_type == kPrefix && total_order) {
continue;
}
if (!bloom_bits && store_index_in_file) {
continue;
}
if (total_order && store_index_in_file) {
continue;
}
bool expect_bloom_not_match = false;
@ -338,30 +401,23 @@ TEST(PlainTableDBTest, Flush2) {
options.create_if_missing = true;
// Set only one bucket to force bucket conflict.
// Test index interval for the same prefix to be 1, 2 and 4
PlainTableOptions plain_table_options;
if (total_order) {
options.prefix_extractor = nullptr;
PlainTableOptions plain_table_options;
plain_table_options.user_key_len = 0;
plain_table_options.bloom_bits_per_key = bloom_bits;
plain_table_options.hash_table_ratio = 0;
plain_table_options.index_sparseness = 2;
plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
plain_table_options.encoding_type = encoding_type;
options.table_factory.reset(new TestPlainTableFactory(
&expect_bloom_not_match, plain_table_options));
} else {
PlainTableOptions plain_table_options;
plain_table_options.user_key_len = 0;
plain_table_options.bloom_bits_per_key = bloom_bits;
plain_table_options.hash_table_ratio = 0.75;
plain_table_options.index_sparseness = 16;
}
plain_table_options.user_key_len = kPlainTableVariableLength;
plain_table_options.bloom_bits_per_key = bloom_bits;
plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
plain_table_options.encoding_type = encoding_type;
plain_table_options.store_index_in_file = store_index_in_file;
options.table_factory.reset(new TestPlainTableFactory(
&expect_bloom_not_match, plain_table_options));
}
DestroyAndReopen(&options);
ASSERT_OK(Put("0000000000000bar", "b"));
ASSERT_OK(Put("1000000000000foo", "v1"));
@ -389,7 +445,6 @@ TEST(PlainTableDBTest, Flush2) {
// Neither key nor value should exist.
expect_bloom_not_match = true;
ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
// Key doesn't exist any more but prefix exists.
if (total_order) {
ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
@ -401,6 +456,7 @@ TEST(PlainTableDBTest, Flush2) {
}
}
}
}
}
TEST(PlainTableDBTest, Iterator) {

@ -119,6 +119,8 @@ enum EncodingType : char {
struct PlainTablePropertyNames {
static const std::string kPrefixExtractorName;
static const std::string kEncodingType;
static const std::string kBloomVersion;
static const std::string kNumBloomBlocks;
};
const uint32_t kPlainTableVariableLength = 0;
@ -166,6 +168,11 @@ EncodingType encoding_type = kPlain;
// @full_scan_mode: mode for reading the whole file one record by one without
// using the index.
bool full_scan_mode = false;
// @store_index_in_file: compute the plain table index and bloom filter while
// building the file and store them in the file. When the file is read, the
// index will be mmapped instead of being recomputed.
bool store_index_in_file = false;
};
// -- Plain Table with prefix-only seek

@ -0,0 +1,23 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include "table/bloom_block.h"
#include <string>
#include "rocksdb/slice.h"
#include "util/dynamic_bloom.h"
namespace rocksdb {
// Feeds every precomputed hash in `keys_hashes` into the underlying bloom
// filter, in order.
// NOTE(review): the vector is taken by value (matching the declaration in
// bloom_block.h), so each call copies it; switching to a const reference
// would require changing the declaration as well.
void BloomBlockBuilder::AddKeysHashes(const std::vector<uint32_t> keys_hashes) {
  for (auto it = keys_hashes.begin(); it != keys_hashes.end(); ++it) {
    bloom_.AddHash(*it);
  }
}
// Returns the data exposed by the underlying bloom filter's GetRawData().
Slice BloomBlockBuilder::Finish() {
  Slice raw_bloom = bloom_.GetRawData();
  return raw_bloom;
}
// Name under which the bloom block is registered in (and looked up from) the
// metaindex block.
const std::string BloomBlockBuilder::kBloomBlock = "kBloomBlock";
} // namespace rocksdb

@ -0,0 +1,37 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#include <vector>
#include <string>
#include "util/dynamic_bloom.h"
namespace rocksdb {
class Logger;
// Collects key (or prefix) hashes into a DynamicBloom so that the resulting
// filter can be written out as a block. All work is delegated to the wrapped
// DynamicBloom instance.
class BloomBlockBuilder {
 public:
  // Name of the bloom meta block.
  static const std::string kBloomBlock;

  // num_probes is forwarded to the DynamicBloom constructor.
  explicit BloomBlockBuilder(uint32_t num_probes = 6)
      : bloom_(num_probes, nullptr) {}

  // Sizes the bloom filter; every argument is forwarded unchanged to
  // DynamicBloom::SetTotalBits().
  void SetTotalBits(Arena* arena, uint32_t total_bits, uint32_t locality,
                    size_t huge_page_tlb_size, Logger* logger) {
    bloom_.SetTotalBits(arena, total_bits, locality, huge_page_tlb_size,
                        logger);
  }

  // Number of blocks reported by the underlying bloom filter.
  uint32_t GetNumBlocks() const {
    return bloom_.GetNumBlocks();
  }

  // Adds each hash in keys_hashes to the bloom filter.
  void AddKeysHashes(const std::vector<uint32_t> keys_hashes);

  // Returns the bloom filter's data as a Slice.
  Slice Finish();

 private:
  DynamicBloom bloom_;
};
}; // namespace rocksdb

@ -273,4 +273,72 @@ Status FindMetaBlock(Iterator* meta_index_iter,
}
}
// Looks up the meta block called `meta_block_name` in `file`.
//
// Reads the footer (validated against `table_magic_number`), loads the
// metaindex block it points to (checksum verification is turned off for this
// read), and then searches the metaindex for the requested name. On success
// the handle of the meta block is stored in `*block_handle`. Any failure from
// the footer read, the block read, or the lookup is returned unchanged.
Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size,
                     uint64_t table_magic_number, Env* env,
                     const std::string& meta_block_name,
                     BlockHandle* block_handle) {
  Footer footer(table_magic_number);
  auto status = ReadFooterFromFile(file, file_size, &footer);
  if (!status.ok()) {
    return status;
  }

  // Load the metaindex block referenced by the footer.
  ReadOptions read_options;
  read_options.verify_checksums = false;
  BlockContents metaindex_contents;
  status = ReadBlockContents(file, footer, read_options,
                             footer.metaindex_handle(), &metaindex_contents,
                             env, false);
  if (!status.ok()) {
    return status;
  }

  // Search the metaindex for the requested block name.
  Block metaindex_block(metaindex_contents);
  std::unique_ptr<Iterator> metaindex_iter(
      metaindex_block.NewIterator(BytewiseComparator()));
  return FindMetaBlock(metaindex_iter.get(), meta_block_name, block_handle);
}
// Reads the meta block called `meta_block_name` from `file` into `*contents`.
//
// Steps: read the footer (validated against `table_magic_number`), read the
// metaindex block it references, find the handle registered under
// `meta_block_name`, and finally read the block behind that handle. Checksum
// verification is disabled for both block reads. The first non-OK status
// encountered is returned as-is.
Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size,
                     uint64_t table_magic_number, Env* env,
                     const std::string& meta_block_name,
                     BlockContents* contents) {
  Footer footer(table_magic_number);
  auto status = ReadFooterFromFile(file, file_size, &footer);
  if (!status.ok()) {
    return status;
  }

  // Step 1: load the metaindex block.
  ReadOptions read_options;
  read_options.verify_checksums = false;
  BlockContents metaindex_contents;
  status = ReadBlockContents(file, footer, read_options,
                             footer.metaindex_handle(), &metaindex_contents,
                             env, false);
  if (!status.ok()) {
    return status;
  }

  // Step 2: resolve the meta block name to a handle.
  Block metaindex_block(metaindex_contents);
  std::unique_ptr<Iterator> metaindex_iter(
      metaindex_block.NewIterator(BytewiseComparator()));
  BlockHandle meta_block_handle;
  status = FindMetaBlock(metaindex_iter.get(), meta_block_name,
                         &meta_block_handle);
  if (!status.ok()) {
    return status;
  }

  // Step 3: read the meta block itself.
  status = ReadBlockContents(file, footer, read_options, meta_block_handle,
                             contents, env, false);
  return status;
}
} // namespace rocksdb

@ -15,6 +15,7 @@
#include "rocksdb/slice.h"
#include "rocksdb/table_properties.h"
#include "table/block_builder.h"
#include "table/format.h"
namespace rocksdb {
@ -128,4 +129,18 @@ Status FindMetaBlock(Iterator* meta_index_iter,
const std::string& meta_block_name,
BlockHandle* block_handle);
// Find the meta block named meta_block_name in `file` and store its handle in
// `block_handle`.
Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size,
uint64_t table_magic_number, Env* env,
const std::string& meta_block_name,
BlockHandle* block_handle);
// Read the specified meta block with name meta_block_name
// from `file` and initialize `contents` with contents of this block.
// Return Status::OK in case of success.
Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size,
uint64_t table_magic_number, Env* env,
const std::string& meta_block_name,
BlockContents* contents);
} // namespace rocksdb

@ -6,6 +6,7 @@
#ifndef ROCKSDB_LITE
#include "table/plain_table_builder.h"
#include <string>
#include <assert.h>
#include <map>
@ -17,6 +18,8 @@
#include "table/plain_table_factory.h"
#include "db/dbformat.h"
#include "table/block_builder.h"
#include "table/bloom_block.h"
#include "table/plain_table_index.h"
#include "table/filter_block.h"
#include "table/format.h"
#include "table/meta_blocks.h"
@ -54,20 +57,36 @@ Status WriteBlock(
extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull;
extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull;
PlainTableBuilder::PlainTableBuilder(const Options& options, WritableFile* file,
uint32_t user_key_len,
EncodingType encoding_type,
size_t index_sparseness)
PlainTableBuilder::PlainTableBuilder(
const Options& options, WritableFile* file, uint32_t user_key_len,
EncodingType encoding_type, size_t index_sparseness,
uint32_t bloom_bits_per_key, uint32_t num_probes, size_t huge_page_tlb_size,
double hash_table_ratio, bool store_index_in_file)
: options_(options),
bloom_block_(num_probes),
file_(file),
bloom_bits_per_key_(bloom_bits_per_key),
huge_page_tlb_size_(huge_page_tlb_size),
encoder_(encoding_type, user_key_len, options.prefix_extractor.get(),
index_sparseness) {
index_sparseness),
store_index_in_file_(store_index_in_file),
prefix_extractor_(options.prefix_extractor.get()) {
// Build index block and save it in the file if hash_table_ratio > 0
if (store_index_in_file_) {
assert(hash_table_ratio > 0 || IsTotalOrderMode());
index_builder_.reset(
new PlainTableIndexBuilder(&arena_, options, index_sparseness,
hash_table_ratio, huge_page_tlb_size_));
assert(bloom_bits_per_key_ > 0);
properties_.user_collected_properties
[PlainTablePropertyNames::kBloomVersion] = "1"; // For future use
}
properties_.fixed_key_len = user_key_len;
// for plain table, we put all the data in a big chunk.
properties_.num_data_blocks = 1;
// emphasize that currently plain table doesn't have persistent index or
// filter block.
// Fill it later if store_index_in_file_ == true
properties_.index_size = 0;
properties_.filter_size = 0;
// To support roll-back to previous version, now still use version 0 for
@ -100,9 +119,28 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) {
char meta_bytes_buf[6];
size_t meta_bytes_buf_size = 0;
ParsedInternalKey internal_key;
ParseInternalKey(key, &internal_key);
// Store key hash
if (store_index_in_file_) {
if (options_.prefix_extractor.get() == nullptr) {
keys_or_prefixes_hashes_.push_back(GetSliceHash(internal_key.user_key));
} else {
Slice prefix =
options_.prefix_extractor->Transform(internal_key.user_key);
keys_or_prefixes_hashes_.push_back(GetSliceHash(prefix));
}
}
// Remember the offset at which this record starts; it is what gets indexed.
auto prev_offset = offset_;
// Write out the key
encoder_.AppendKey(key, file_, &offset_, meta_bytes_buf,
&meta_bytes_buf_size);
if (SaveIndexInFile()) {
index_builder_->AddKeyPrefix(GetPrefix(internal_key), prev_offset);
}
// Write value length
int value_size = value.size();
@ -134,11 +172,50 @@ Status PlainTableBuilder::Finish() {
properties_.data_size = offset_;
// Write the following blocks
// 1. [meta block: properties]
// 2. [metaindex block]
// 3. [footer]
// 1. [meta block: bloom] - optional
// 2. [meta block: index] - optional
// 3. [meta block: properties]
// 4. [metaindex block]
// 5. [footer]
MetaIndexBuilder meta_index_builer;
if (store_index_in_file_ && (properties_.num_entries > 0)) {
bloom_block_.SetTotalBits(
&arena_, properties_.num_entries * bloom_bits_per_key_,
options_.bloom_locality, huge_page_tlb_size_, options_.info_log.get());
PutVarint32(&properties_.user_collected_properties
[PlainTablePropertyNames::kNumBloomBlocks],
bloom_block_.GetNumBlocks());
bloom_block_.AddKeysHashes(keys_or_prefixes_hashes_);
BlockHandle bloom_block_handle;
auto finish_result = bloom_block_.Finish();
properties_.filter_size = finish_result.size();
auto s = WriteBlock(finish_result, file_, &offset_, &bloom_block_handle);
if (!s.ok()) {
return s;
}
BlockHandle index_block_handle;
finish_result = index_builder_->Finish();
properties_.index_size = finish_result.size();
s = WriteBlock(finish_result, file_, &offset_, &index_block_handle);
if (!s.ok()) {
return s;
}
meta_index_builer.Add(BloomBlockBuilder::kBloomBlock, bloom_block_handle);
meta_index_builer.Add(PlainTableIndexBuilder::kPlainTableIndexBlock,
index_block_handle);
}
// Calculate bloom block size and index block size
PropertyBlockBuilder property_block_builder;
// -- Add basic properties
property_block_builder.AddTableProperty(properties_);

@ -13,6 +13,8 @@
#include "table/plain_table_key_coding.h"
#include "rocksdb/table.h"
#include "rocksdb/table_properties.h"
#include "table/bloom_block.h"
#include "table/plain_table_index.h"
namespace rocksdb {
@ -30,7 +32,10 @@ class PlainTableBuilder: public TableBuilder {
// that the caller does not know which level the output file will reside.
PlainTableBuilder(const Options& options, WritableFile* file,
uint32_t user_key_size, EncodingType encoding_type,
size_t index_sparseness);
size_t index_sparseness, uint32_t bloom_bits_per_key,
uint32_t num_probes = 6, size_t huge_page_tlb_size = 0,
double hash_table_ratio = 0,
bool store_index_in_file = false);
// REQUIRES: Either Finish() or Abandon() has been called.
~PlainTableBuilder();
@ -62,18 +67,59 @@ class PlainTableBuilder: public TableBuilder {
// Finish() call, returns the size of the final generated file.
uint64_t FileSize() const override;
// True when this builder was configured to store the index in the file.
bool SaveIndexInFile() const {
  return store_index_in_file_;
}
private:
Arena arena_;
Options options_;
std::vector<std::unique_ptr<TablePropertiesCollector>>
table_properties_collectors_;
BloomBlockBuilder bloom_block_;
std::unique_ptr<PlainTableIndexBuilder> index_builder_;
WritableFile* file_;
uint64_t offset_ = 0;
uint32_t bloom_bits_per_key_;
uint32_t huge_page_tlb_size_;
Status status_;
TableProperties properties_;
PlainTableKeyEncoder encoder_;
bool store_index_in_file_;
std::vector<uint32_t> keys_or_prefixes_hashes_;
bool closed_ = false; // Either Finish() or Abandon() has been called.
const SliceTransform* prefix_extractor_;
// Returns the prefix for an internal key: the trailing 8 bytes are stripped
// to obtain the user key, which is then mapped to its prefix.
Slice GetPrefix(const Slice& target) const {
  assert(target.size() >= 8);  // target is internal key
  const Slice user_key = GetUserKey(target);
  return GetPrefixFromUserKey(user_key);
}
// Returns the prefix for an already-parsed internal key, based on its
// user_key field.
Slice GetPrefix(const ParsedInternalKey& target) const {
  const Slice& user_key = target.user_key;
  return GetPrefixFromUserKey(user_key);
}
// Returns `key` without its last 8 bytes.
// NOTE(review): no size check here; callers are expected to pass a key of at
// least 8 bytes (GetPrefix asserts this before calling).
Slice GetUserKey(const Slice& key) const {
  const size_t user_key_size = key.size() - 8;
  return Slice(key.data(), user_key_size);
}
// Maps a user key to the prefix used for indexing.
Slice GetPrefixFromUserKey(const Slice& user_key) const {
  if (IsTotalOrderMode()) {
    // No prefix extractor is set: use an empty slice as the prefix. In that
    // case lookups fall back to pure binary search and total-order iterator
    // seek is supported.
    return Slice();
  }
  return prefix_extractor_->Transform(user_key);
}
// Total-order mode is in effect exactly when no prefix extractor is set.
bool IsTotalOrderMode() const {
  return prefix_extractor_ == nullptr;
}
// No copying allowed
PlainTableBuilder(const PlainTableBuilder&) = delete;
void operator=(const PlainTableBuilder&) = delete;

@ -30,7 +30,9 @@ TableBuilder* PlainTableFactory::NewTableBuilder(
const Options& options, const InternalKeyComparator& internal_comparator,
WritableFile* file, CompressionType compression_type) const {
return new PlainTableBuilder(options, file, user_key_len_, encoding_type_,
index_sparseness_);
index_sparseness_, bloom_bits_per_key_, 6,
huge_page_tlb_size_, hash_table_ratio_,
store_index_in_file_);
}
extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options) {
@ -43,5 +45,11 @@ const std::string PlainTablePropertyNames::kPrefixExtractorName =
const std::string PlainTablePropertyNames::kEncodingType =
"rocksdb.plain.table.encoding.type";
const std::string PlainTablePropertyNames::kBloomVersion =
"rocksdb.plain.table.bloom.version";
const std::string PlainTablePropertyNames::kNumBloomBlocks =
"rocksdb.plain.table.bloom.numblocks";
} // namespace rocksdb
#endif // ROCKSDB_LITE

@ -151,7 +151,8 @@ class PlainTableFactory : public TableFactory {
index_sparseness_(options.index_sparseness),
huge_page_tlb_size_(options.huge_page_tlb_size),
encoding_type_(options.encoding_type),
full_scan_mode_(options.full_scan_mode) {}
full_scan_mode_(options.full_scan_mode),
store_index_in_file_(options.store_index_in_file) {}
const char* Name() const override { return "PlainTable"; }
Status NewTableReader(const Options& options, const EnvOptions& soptions,
const InternalKeyComparator& internal_comparator,
@ -173,6 +174,7 @@ class PlainTableFactory : public TableFactory {
size_t huge_page_tlb_size_;
EncodingType encoding_type_;
bool full_scan_mode_;
bool store_index_in_file_;
};
} // namespace rocksdb

@ -0,0 +1,196 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include "table/plain_table_index.h"
#include "util/coding.h"
#include "util/hash.h"
namespace rocksdb {
namespace {
// Maps a 32-bit hash onto a bucket id in [0, num_buckets).
// num_buckets must be non-zero.
inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
  assert(num_buckets > 0);
  const uint32_t bucket_id = hash % num_buckets;
  return bucket_id;
}
}
// Initializes this index from a serialized index block.
//
// Expected layout of `data`:
//   varint32  index_size      (number of 32-bit buckets, must be > 0)
//   varint32  num_prefixes
//   uint32_t  bucket[index_size]
//   bytes     sub-index       (everything that remains)
//
// No bytes are copied: index_ and sub_index_ point into the buffer behind
// `data`, so that buffer must stay alive for as long as this object is used.
//
// The varint decoding is done outside of assert() on purpose. The decoding
// has side effects (it advances `data` and fills in the members), and an
// assert() expression is not evaluated at all when NDEBUG is defined.
void PlainTableIndex::InitFromRawData(Slice data) {
  const bool index_size_ok = GetVarint32(&data, &index_size_);
  assert(index_size_ok);
  static_cast<void>(index_size_ok);  // silence unused warning under NDEBUG
  assert(index_size_ > 0);

  const bool num_prefixes_ok = GetVarint32(&data, &num_prefixes_);
  assert(num_prefixes_ok);
  static_cast<void>(num_prefixes_ok);  // silence unused warning under NDEBUG

  // `data` now starts at the bucket array; whatever follows the buckets is
  // the sub-index.
  sub_index_size_ = data.size() - index_size_ * kOffsetLen;

  char* index_data_begin = const_cast<char*>(data.data());
  index_ = reinterpret_cast<uint32_t*>(index_data_begin);
  sub_index_ = reinterpret_cast<char*>(index_ + index_size_);
}
// Looks up the bucket that `prefix_hash` maps to and classifies its content.
//
// On return `*bucket_value` holds:
//   - kSubindex:          the offset into the sub-index (flag bit cleared);
//   - kDirectToFile:      the file offset stored in the bucket;
//   - kNoPrefixForBucket: the raw bucket value (>= kMaxFileSize).
//
// The bucket id is kept as uint32_t, the type GetBucketIdFromHash() returns,
// so it is never narrowed to a signed value before being used as an index.
PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset(
    uint32_t prefix_hash, uint32_t* bucket_value) const {
  const uint32_t bucket = GetBucketIdFromHash(prefix_hash, index_size_);
  *bucket_value = index_[bucket];

  // Highest bit set: the remaining bits are an offset into the sub-index.
  if ((*bucket_value & kSubIndexMask) == kSubIndexMask) {
    *bucket_value ^= kSubIndexMask;
    return kSubindex;
  }

  if (*bucket_value >= kMaxFileSize) {
    return kNoPrefixForBucket;
  }
  // point directly to the file
  return kDirectToFile;
}
// Appends one (hash, offset) record to the list. When the current group
// already holds kNumRecordsPerGroup records, a new group is allocated first.
// The new record's `next` link starts out as nullptr.
void PlainTableIndexBuilder::IndexRecordList::AddRecord(murmur_t hash,
                                                        uint32_t offset) {
  const bool current_group_full =
      (num_records_in_current_group_ == kNumRecordsPerGroup);
  if (current_group_full) {
    current_group_ = AllocateNewGroup();
    num_records_in_current_group_ = 0;
  }

  auto& slot = current_group_[num_records_in_current_group_];
  ++num_records_in_current_group_;

  slot.hash = hash;
  slot.offset = offset;
  slot.next = nullptr;
}
// Registers the prefix of the next key, which is stored at `key_offset`.
//
// An index record is emitted for the first key of every new prefix, and
// afterwards whenever the number of keys seen for the current prefix is a
// multiple of index_sparseness_ (or for every key when index_sparseness_ is
// 0). Also maintains the prefix count and the keys-per-prefix histogram.
// NOTE(review): key_offset is 64-bit while AddRecord() takes a 32-bit offset;
// the value is narrowed implicitly.
void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice,
                                          uint64_t key_offset) {
  const bool starts_new_prefix =
      is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString();
  if (starts_new_prefix) {
    num_prefixes_++;
    if (!is_first_record_) {
      // Close out the statistics of the prefix that just ended.
      keys_per_prefix_hist_.Add(num_keys_per_prefix_);
    }
    prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice);
    prev_key_prefix_ = key_prefix_slice.ToString();
    num_keys_per_prefix_ = 0;
    due_index_ = true;
  }

  if (due_index_) {
    record_list_.AddRecord(prev_key_prefix_hash_, key_offset);
  }

  ++num_keys_per_prefix_;

  // Whether or not a record was just emitted, the next key is due for one
  // exactly when the per-prefix key count hits a sparseness boundary.
  due_index_ = (index_sparseness_ == 0 ||
                num_keys_per_prefix_ % index_sparseness_ == 0);
  is_first_record_ = false;
}
// Builds the final index from all records added so far and returns it as a
// Slice (see FillIndexes() for the layout).
Slice PlainTableIndexBuilder::Finish() {
  // Decide how many buckets the hash index gets.
  AllocateIndex();

  // Temporary per-bucket state: head of each bucket's record chain and the
  // number of records in it.
  std::vector<IndexRecord*> bucket_heads(index_size_, nullptr);
  std::vector<uint32_t> bucket_counts(index_size_, 0);
  BucketizeIndexes(&bucket_heads, &bucket_counts);

  // Account for the last prefix, which has no successor to close it out.
  keys_per_prefix_hist_.Add(num_keys_per_prefix_);
  const auto histogram_text = keys_per_prefix_hist_.ToString();
  Log(options_.info_log, "Number of Keys per prefix Histogram: %s",
      histogram_text.c_str());

  // From the temp data structure, populate indexes.
  return FillIndexes(bucket_heads, bucket_counts);
}
// Chooses the number of hash buckets (index_size_).
//
// Without a prefix extractor, or with a non-positive hash_table_ratio_, a
// single bucket is used so lookups fall back to pure binary search.
// Otherwise the size is num_prefixes_ / hash_table_ratio_ (computed via the
// reciprocal) plus one.
void PlainTableIndexBuilder::AllocateIndex() {
  if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) {
    index_size_ = 1;
    return;
  }

  const double buckets_per_prefix = 1.0 / hash_table_ratio_;
  index_size_ = num_prefixes_ * buckets_per_prefix + 1;
  assert(index_size_ > 0);
}
// Distributes all index records over the hash buckets and computes the size
// of the sub-index.
//
// hash_to_offsets:    in/out, one entry per bucket; receives the head of the
//                     bucket's singly linked record chain. Records are
//                     prepended, so each chain is in reverse insertion order.
// entries_per_bucket: in/out, one entry per bucket; incremented once for
//                     every record placed in that bucket.
//
// Side effects: rewrites the `next` link of every record and sets
// sub_index_size_ to the number of bytes needed for all buckets holding more
// than one record (a varint32 count plus kOffsetLen bytes per record).
void PlainTableIndexBuilder::BucketizeIndexes(
    std::vector<IndexRecord*>* hash_to_offsets,
    std::vector<uint32_t>* entries_per_bucket) {
  const size_t num_records = record_list_.GetNumRecords();
  for (size_t i = 0; i < num_records; i++) {
    IndexRecord* index_record = record_list_.At(i);
    const uint32_t cur_hash = index_record->hash;
    const uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);

    // Push the record onto the front of its bucket's chain.
    index_record->next = (*hash_to_offsets)[bucket];
    (*hash_to_offsets)[bucket] = index_record;
    (*entries_per_bucket)[bucket]++;
  }

  sub_index_size_ = 0;
  for (auto entry_count : *entries_per_bucket) {
    // Only buckets with more than 1 entry will have subindex.
    if (entry_count <= 1) {
      continue;
    }
    // Bytes for the record count itself.
    sub_index_size_ += VarintLength(entry_count);
    // total bytes needed to store these entries' in-file offsets.
    sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen;
  }
}
// Serializes the bucketized index into a buffer allocated from arena_ and
// returns a Slice covering it.
//
// Buffer layout, in write order:
//   varint32 index_size_
//   varint32 num_prefixes_
//   index_size_ 32-bit bucket entries
//   sub-index (for each bucket with more than one record: a varint32 record
//   count followed by that many fixed 32-bit file offsets)
//
// hash_to_offsets:    per bucket, head of the record chain produced by
//                     BucketizeIndexes().
// entries_per_bucket: per bucket, number of records in that chain.
//
// NOTE(review): bucket entries are written through a uint32_t* that starts
// right after two varints, while sub-index offsets go through
// EncodeFixed32 — confirm alignment/endianness expectations with the reader.
// NOTE(review): the final log line prints index_size_ with "%d" — confirm
// this matches the member's declared type.
Slice PlainTableIndexBuilder::FillIndexes(
const std::vector<IndexRecord*>& hash_to_offsets,
const std::vector<uint32_t>& entries_per_bucket) {
Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index",
sub_index_size_);
auto total_allocate_size = GetTotalSize();
char* allocated = arena_->AllocateAligned(
total_allocate_size, huge_page_tlb_size_, options_.info_log.get());
// Header: the two varints; the bucket array starts right after them.
auto temp_ptr = EncodeVarint32(allocated, index_size_);
uint32_t* index =
reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_));
// The sub-index begins immediately after the last bucket entry.
char* sub_index = reinterpret_cast<char*>(index + index_size_);
// Number of sub-index bytes written so far.
size_t sub_index_offset = 0;
for (uint32_t i = 0; i < index_size_; i++) {
uint32_t num_keys_for_bucket = entries_per_bucket[i];
switch (num_keys_for_bucket) {
case 0:
// No key for bucket
index[i] = PlainTableIndex::kMaxFileSize;
break;
case 1:
// point directly to the file offset
index[i] = hash_to_offsets[i]->offset;
break;
default:
// point to second level indexes.
index[i] = sub_index_offset | PlainTableIndex::kSubIndexMask;
// Write the record count as a varint32 and advance past it.
char* prev_ptr = &sub_index[sub_index_offset];
char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
sub_index_offset += (cur_ptr - prev_ptr);
char* sub_index_pos = &sub_index[sub_index_offset];
IndexRecord* record = hash_to_offsets[i];
int j;
// The chain is in reverse insertion order (BucketizeIndexes prepends), so
// fill the slots from the last one backwards to restore insertion order.
for (j = num_keys_for_bucket - 1; j >= 0 && record;
j--, record = record->next) {
EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
}
// The chain length must match the recorded count exactly.
assert(j == -1 && record == nullptr);
sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket;
assert(sub_index_offset <= sub_index_size_);
break;
}
}
// Everything reserved for the sub-index must have been used.
assert(sub_index_offset == sub_index_size_);
Log(options_.info_log, "hash table size: %d, suffix_map length %zu",
index_size_, sub_index_size_);
return Slice(allocated, GetTotalSize());
}
// Name under which the plain table index block is registered in (and looked
// up from) the metaindex block.
const std::string PlainTableIndexBuilder::kPlainTableIndexBlock =
"PlainTableIndexBlock";
}; // namespace rocksdb

@ -0,0 +1,221 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#include <string>
#include <vector>
#include "db/dbformat.h"
#include "rocksdb/options.h"
#include "util/murmurhash.h"
#include "util/hash.h"
#include "util/arena.h"
#include "util/histogram.h"
namespace rocksdb {
// PlainTableIndex contains buckets size of index_size_, each is a
// 32-bit integer. The lower 31 bits contain an offset value (explained below)
// and the first bit of the integer indicates type of the offset.
//
// +--------------+------------------------------------------------------+
// | Flag (1 bit) | Offset to binary search buffer or file (31 bits) +
// +--------------+------------------------------------------------------+
//
// Explanation for the "flag bit":
//
// 0 indicates that the bucket contains only one prefix (no conflict when
// hashing this prefix), whose first row starts from this offset of the
// file.
// 1 indicates that the bucket contains more than one prefixes, or there
// are too many rows for one prefix so we need a binary search for it. In
// this case, the offset indicates the offset of sub_index_ holding the
// binary search indexes of keys for those rows. Those binary search indexes
// are organized in this way:
//
// A varint32 comes first and indicates how many indexes (N) are stored after
// it. After it, there are N 32-bit integers, each of which is an offset into
// the file that points to the start of a row. Those offsets need to be
// guaranteed to be in ascending order so the keys they are pointing to are
// also in ascending order, which ensures we can use them to do binary
// searches. Below is a visual presentation of a bucket.
//
// <begin>
// number_of_records: varint32
// record 1 file offset: fixedint32
// record 2 file offset: fixedint32
// ....
// record N file offset: fixedint32
// <end>
class PlainTableIndex {
public:
enum IndexSearchResult {
kNoPrefixForBucket = 0,
kDirectToFile = 1,
kSubindex = 2
};
explicit PlainTableIndex(Slice data) { InitFromRawData(data); }
PlainTableIndex()
: index_size_(0),
sub_index_size_(0),
num_prefixes_(0),
index_(nullptr),
sub_index_(nullptr) {}
IndexSearchResult GetOffset(uint32_t prefix_hash,
uint32_t* bucket_value) const;
void InitFromRawData(Slice data);
const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset,
uint32_t* upper_bound) const {
const char* index_ptr = &sub_index_[offset];
return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound);
}
uint32_t GetIndexSize() const { return index_size_; }
uint32_t GetSubIndexSize() const { return sub_index_size_; }
uint32_t GetNumPrefixes() const { return num_prefixes_; }
static const uint64_t kMaxFileSize = (1u << 31) - 1;
static const uint32_t kSubIndexMask = 0x80000000;
static const size_t kOffsetLen = sizeof(uint32_t);
private:
uint32_t index_size_;
size_t sub_index_size_;
uint32_t num_prefixes_;
uint32_t* index_;
char* sub_index_;
};
// PlainTableIndexBuilder is used to create plain table index.
// After calling Finish(), it returns Slice, which is usually
// used either to initialize PlainTableIndex or
// to save index to sst file.
// For more details about the index, please refer to:
// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
// #wiki-in-memory-index-format
class PlainTableIndexBuilder {
 public:
  // arena:              allocator handed to the builder (not owned).
  // options:            copied into the builder; its prefix_extractor is
  //                     also cached as a raw pointer.
  // index_sparseness:   sparseness setting forwarded to the index logic.
  // hash_table_ratio:   ratio setting forwarded to the index logic.
  // huge_page_tlb_size: a size in bytes, hence size_t (it used to be declared
  //                     as double, which silently converted the size_t that
  //                     callers pass in).
  //
  // index_size_ and sub_index_size_ start out as 0 so that GetTotalSize()
  // never reads indeterminate values, whenever it happens to be called.
  PlainTableIndexBuilder(Arena* arena, const Options& options,
                         uint32_t index_sparseness, double hash_table_ratio,
                         size_t huge_page_tlb_size)
      : arena_(arena),
        options_(options),
        record_list_(kRecordsPerGroup),
        is_first_record_(true),
        due_index_(false),
        num_prefixes_(0),
        num_keys_per_prefix_(0),
        prev_key_prefix_hash_(0),
        index_sparseness_(index_sparseness),
        index_size_(0),
        sub_index_size_(0),
        prefix_extractor_(options.prefix_extractor.get()),
        hash_table_ratio_(hash_table_ratio),
        huge_page_tlb_size_(huge_page_tlb_size) {}

  // Feeds the prefix of the next key together with the key's file offset.
  void AddKeyPrefix(Slice key_prefix_slice, uint64_t key_offset);

  // Finalizes the index and returns it in serialized form.
  Slice Finish();

  // Serialized size of the index: the two varint headers, the bucket array
  // and the sub-index area.
  uint32_t GetTotalSize() const {
    return VarintLength(index_size_) + VarintLength(num_prefixes_) +
           PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_;
  }

  // Name of the meta block the index is stored under.
  static const std::string kPlainTableIndexBlock;

 private:
  struct IndexRecord {
    uint32_t hash;    // hash of the prefix
    uint32_t offset;  // offset of a row
    IndexRecord* next;
  };

  // Helper class to track all the index records
  class IndexRecordList {
   public:
    explicit IndexRecordList(size_t num_records_per_group)
        : kNumRecordsPerGroup(num_records_per_group),
          current_group_(nullptr),
          // Starting "full" forces the first AddRecord() to allocate a group.
          num_records_in_current_group_(num_records_per_group) {}

    ~IndexRecordList() {
      for (size_t i = 0; i < groups_.size(); i++) {
        delete[] groups_[i];
      }
    }

    // The list owns the raw arrays in groups_ and frees them in the
    // destructor; a member-wise copy would free them twice.
    IndexRecordList(const IndexRecordList&) = delete;
    IndexRecordList& operator=(const IndexRecordList&) = delete;

    void AddRecord(murmur_t hash, uint32_t offset);

    // For an empty list groups_.size() - 1 wraps around, but unsigned
    // arithmetic makes the sum come out as 0 because the initial value of
    // num_records_in_current_group_ equals kNumRecordsPerGroup.
    size_t GetNumRecords() const {
      return (groups_.size() - 1) * kNumRecordsPerGroup +
             num_records_in_current_group_;
    }

    // Returns the index-th record; index must be below GetNumRecords().
    IndexRecord* At(size_t index) {
      return &(groups_[index / kNumRecordsPerGroup]
                      [index % kNumRecordsPerGroup]);
    }

   private:
    IndexRecord* AllocateNewGroup() {
      IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
      groups_.push_back(result);
      return result;
    }

    // Each group in `groups_` contains fix-sized records (determined by
    // kNumRecordsPerGroup). Which can help us minimize the cost if resizing
    // occurs.
    const size_t kNumRecordsPerGroup;
    IndexRecord* current_group_;
    // List of arrays allocated
    std::vector<IndexRecord*> groups_;
    size_t num_records_in_current_group_;
  };

  void AllocateIndex();

  // Internal helper function to bucket index record list to hash buckets.
  void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets,
                        std::vector<uint32_t>* entries_per_bucket);

  // Internal helper function to fill the index from the bucketized records
  // produced by BucketizeIndexes(); returns the result as a Slice.
  Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets,
                    const std::vector<uint32_t>& entries_per_bucket);

  Arena* arena_;
  Options options_;
  HistogramImpl keys_per_prefix_hist_;
  IndexRecordList record_list_;
  bool is_first_record_;
  bool due_index_;
  uint32_t num_prefixes_;
  uint32_t num_keys_per_prefix_;
  uint32_t prev_key_prefix_hash_;
  uint32_t index_sparseness_;
  uint32_t index_size_;
  size_t sub_index_size_;
  const SliceTransform* prefix_extractor_;
  double hash_table_ratio_;
  size_t huge_page_tlb_size_;

  std::string prev_key_prefix_;

  static const size_t kRecordsPerGroup = 256;
};
}; // namespace rocksdb

@ -3,6 +3,7 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef ROCKSDB_LITE
#include "table/plain_table_reader.h"
#include <string>
@ -18,6 +19,7 @@
#include "rocksdb/statistics.h"
#include "table/block.h"
#include "table/bloom_block.h"
#include "table/filter_block.h"
#include "table/format.h"
#include "table/meta_blocks.h"
@ -39,15 +41,6 @@ namespace rocksdb {
namespace {
// Hashes the bytes of `s` using the fixed seed 397.
inline uint32_t GetSliceHash(const Slice& s) {
  const uint32_t kSeed = 397;
  return Hash(s.data(), s.size(), kSeed);
}
// Maps a hash value to a bucket id in the range [0, num_buckets).
// num_buckets is the divisor of the modulo and therefore must not be zero.
inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
  // The former check `num_buckets >= 0` could never fail for an unsigned
  // value; the condition that actually matters is a non-zero divisor.
  assert(num_buckets > 0);
  return hash % num_buckets;
}
// Safely getting a uint32_t element from a char array, where, starting from
// `base`, every 4 bytes are considered as an fixed 32 bit integer.
inline uint32_t GetFixed32Element(const char* base, size_t offset) {
@ -103,6 +96,7 @@ PlainTableReader::PlainTableReader(const Options& options,
const TableProperties* table_properties)
: internal_comparator_(icomparator),
encoding_type_(encoding_type),
full_scan_mode_(false),
data_end_offset_(table_properties->data_size),
user_key_len_(table_properties->fixed_key_len),
prefix_extractor_(options.prefix_extractor.get()),
@ -126,8 +120,7 @@ Status PlainTableReader::Open(const Options& options,
double hash_table_ratio, size_t index_sparseness,
size_t huge_page_tlb_size, bool full_scan_mode) {
assert(options.allow_mmap_reads);
if (file_size > kMaxFileSize) {
if (file_size > PlainTableIndex::kMaxFileSize) {
return Status::NotSupported("File is too large for PlainTableReader!");
}
@ -173,7 +166,6 @@ Status PlainTableReader::Open(const Options& options,
return s;
}
// -- Populate Index
if (!full_scan_mode) {
s = new_reader->PopulateIndex(props, bloom_bits_per_key, hash_table_ratio,
index_sparseness, huge_page_tlb_size);
@ -183,7 +175,7 @@ Status PlainTableReader::Open(const Options& options,
} else {
// Flag to indicate it is a full scan mode so that none of the indexes
// can be used.
new_reader->index_size_ = kFullScanModeFlag;
new_reader->full_scan_mode_ = true;
}
*table_reader = std::move(new_reader);
@ -203,79 +195,15 @@ Iterator* PlainTableReader::NewIterator(const ReadOptions& options,
}
}
// One index entry: the hash of a prefix plus the file offset of a row.
// Records that land in the same hash bucket are chained through `next`.
struct PlainTableReader::IndexRecord {
uint32_t hash; // hash of the prefix
uint32_t offset; // offset of a row
IndexRecord* next; // next record of the same bucket, nullptr when unlinked
};
// Append-only container of IndexRecord entries. Records live in fixed-size
// arrays ("groups") so that growing the list never moves existing records.
class PlainTableReader::IndexRecordList {
 public:
  explicit IndexRecordList(size_t num_records_per_group)
      : kNumRecordsPerGroup(num_records_per_group),
        current_group_(nullptr),
        // Pretend the (non-existent) current group is full, so the very
        // first AddRecord() allocates one.
        num_records_in_current_group_(num_records_per_group) {}

  ~IndexRecordList() {
    for (IndexRecord* group : groups_) {
      delete[] group;
    }
  }

  // Appends a record with the given hash and offset; its `next` link starts
  // out as nullptr.
  void AddRecord(murmur_t hash, uint32_t offset) {
    const bool group_is_full =
        (num_records_in_current_group_ == kNumRecordsPerGroup);
    if (group_is_full) {
      current_group_ = AllocateNewGroup();
      num_records_in_current_group_ = 0;
    }
    IndexRecord* slot = current_group_ + num_records_in_current_group_;
    ++num_records_in_current_group_;
    slot->hash = hash;
    slot->offset = offset;
    slot->next = nullptr;
  }

  // Number of records added so far: all groups but the last are full.
  size_t GetNumRecords() const {
    const size_t full_groups = groups_.size() - 1;
    return full_groups * kNumRecordsPerGroup + num_records_in_current_group_;
  }

  // Returns the record at position `index` (in insertion order).
  IndexRecord* At(size_t index) {
    IndexRecord* group = groups_[index / kNumRecordsPerGroup];
    return group + (index % kNumRecordsPerGroup);
  }

 private:
  // Allocates one more group, registers it in groups_ and returns it.
  IndexRecord* AllocateNewGroup() {
    groups_.push_back(new IndexRecord[kNumRecordsPerGroup]);
    return groups_.back();
  }

  // Every group holds exactly this many records; using fixed-size groups
  // keeps the cost low when the list has to grow.
  const size_t kNumRecordsPerGroup;
  // Group that currently receives new records.
  IndexRecord* current_group_;
  // All arrays allocated so far.
  std::vector<IndexRecord*> groups_;
  // Number of slots already used in current_group_.
  size_t num_records_in_current_group_;
};
Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list,
int* num_prefixes,
int bloom_bits_per_key,
size_t index_sparseness) {
Status PlainTableReader::PopulateIndexRecordList(
PlainTableIndexBuilder* index_builder, vector<uint32_t>* prefix_hashes) {
Slice prev_key_prefix_slice;
uint32_t prev_key_prefix_hash = 0;
uint32_t pos = data_start_offset_;
int num_keys_per_prefix = 0;
bool is_first_record = true;
HistogramImpl keys_per_prefix_hist;
// Need map to be ordered to make sure sub indexes generated
// are in order.
*num_prefixes = 0;
bool is_first_record = true;
Slice key_prefix_slice;
PlainTableKeyDecoder decoder(encoding_type_, user_key_len_,
options_.prefix_extractor.get());
bool due_index = false;
while (pos < data_end_offset_) {
uint32_t key_offset = pos;
ParsedInternalKey key;
@ -285,152 +213,53 @@ Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list,
if (!s.ok()) {
return s;
}
key_prefix_slice = GetPrefix(key);
if (enable_bloom_) {
// total order mode and bloom filter is enabled.
bloom_.AddHash(GetSliceHash(key.user_key));
}
Slice key_prefix_slice = GetPrefix(key);
} else {
if (is_first_record || prev_key_prefix_slice != key_prefix_slice) {
++(*num_prefixes);
if (!is_first_record) {
keys_per_prefix_hist.Add(num_keys_per_prefix);
prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice));
}
num_keys_per_prefix = 0;
prev_key_prefix_slice = key_prefix_slice;
prev_key_prefix_hash = GetSliceHash(key_prefix_slice);
due_index = true;
}
}
if (due_index) {
if (!seekable) {
index_builder->AddKeyPrefix(GetPrefix(key), key_offset);
if (!seekable && is_first_record) {
return Status::Corruption("Key for a prefix is not seekable");
}
// Add an index key for every kIndexIntervalForSamePrefixKeys keys
record_list->AddRecord(prev_key_prefix_hash, key_offset);
due_index = false;
}
num_keys_per_prefix++;
if (index_sparseness == 0 || num_keys_per_prefix % index_sparseness == 0) {
due_index = true;
}
is_first_record = false;
}
keys_per_prefix_hist.Add(num_keys_per_prefix);
Log(options_.info_log, "Number of Keys per prefix Histogram: %s",
keys_per_prefix_hist.ToString().c_str());
prefix_hashes->push_back(GetSliceHash(key_prefix_slice));
index_.InitFromRawData(index_builder->Finish());
return Status::OK();
}
void PlainTableReader::AllocateIndexAndBloom(int num_prefixes,
int bloom_bits_per_key,
double hash_table_ratio,
size_t huge_page_tlb_size) {
if (prefix_extractor_ != nullptr) {
void PlainTableReader::AllocateAndFillBloom(int bloom_bits_per_key,
int num_prefixes,
size_t huge_page_tlb_size,
vector<uint32_t>* prefix_hashes) {
if (!IsTotalOrderMode()) {
uint32_t bloom_total_bits = num_prefixes * bloom_bits_per_key;
if (bloom_total_bits > 0) {
enable_bloom_ = true;
bloom_.SetTotalBits(&arena_, bloom_total_bits, options_.bloom_locality,
huge_page_tlb_size, options_.info_log.get());
FillBloom(prefix_hashes);
}
}
if (prefix_extractor_ == nullptr || hash_table_ratio <= 0) {
// Fall back to pure binary search if the user fails to specify a prefix
// extractor.
index_size_ = 1;
} else {
double hash_table_size_multipier = 1.0 / hash_table_ratio;
index_size_ = num_prefixes * hash_table_size_multipier + 1;
}
}
// Distributes every record of record_list over the hash buckets and, when the
// bloom filter is enabled in prefix mode, adds each run of equal hashes to it
// once.
//
// hash_to_offsets:    per bucket, head of a linked list of its records; each
//                     record is pushed to the front, so lists end up in
//                     reverse insertion order.
// entries_per_bucket: per bucket, number of records linked into it.
//
// Returns the number of bytes the sub-index needs for all buckets that hold
// more than one record.
size_t PlainTableReader::BucketizeIndexesAndFillBloom(
    IndexRecordList* record_list, std::vector<IndexRecord*>* hash_to_offsets,
    std::vector<uint32_t>* entries_per_bucket) {
  std::vector<IndexRecord*>& bucket_heads = *hash_to_offsets;
  std::vector<uint32_t>& bucket_counts = *entries_per_bucket;
  // Neither operand changes while the records are processed.
  const bool add_to_bloom = enable_bloom_ && !IsTotalOrderMode();

  bool seen_any = false;
  uint32_t last_hash = 0;
  const size_t total_records = record_list->GetNumRecords();
  for (size_t idx = 0; idx != total_records; ++idx) {
    IndexRecord* record = record_list->At(idx);
    const uint32_t hash = record->hash;

    // A hash is only added when it differs from its predecessor.
    if (!seen_any || hash != last_hash) {
      seen_any = true;
      last_hash = hash;
      if (add_to_bloom) {
        bloom_.AddHash(hash);
      }
    }

    // Push the record to the front of its bucket's list.
    const uint32_t bucket = GetBucketIdFromHash(hash, index_size_);
    record->next = bucket_heads[bucket];
    bucket_heads[bucket] = record;
    ++bucket_counts[bucket];
  }

  size_t bytes_needed = 0;
  for (const uint32_t count : bucket_counts) {
    // Only buckets with more than 1 entry will have subindex.
    if (count > 1) {
      // The varint-encoded count followed by the entries' in-file offsets.
      bytes_needed += VarintLength(count) + count * kOffsetLen;
    }
  }
  return bytes_needed;
}
void PlainTableReader::FillIndexes(
const size_t kSubIndexSize,
const std::vector<IndexRecord*>& hash_to_offsets,
const std::vector<uint32_t>& entries_per_bucket,
size_t huge_page_tlb_size) {
Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index",
kSubIndexSize);
auto total_allocate_size = sizeof(uint32_t) * index_size_ + kSubIndexSize;
char* allocated = arena_.AllocateAligned(
total_allocate_size, huge_page_tlb_size, options_.info_log.get());
index_ = reinterpret_cast<uint32_t*>(allocated);
sub_index_ = allocated + sizeof(uint32_t) * index_size_;
size_t sub_index_offset = 0;
for (int i = 0; i < index_size_; i++) {
uint32_t num_keys_for_bucket = entries_per_bucket[i];
switch (num_keys_for_bucket) {
case 0:
// No key for bucket
index_[i] = data_end_offset_;
break;
case 1:
// point directly to the file offset
index_[i] = hash_to_offsets[i]->offset;
break;
default:
// point to second level indexes.
index_[i] = sub_index_offset | kSubIndexMask;
char* prev_ptr = &sub_index_[sub_index_offset];
char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
sub_index_offset += (cur_ptr - prev_ptr);
char* sub_index_pos = &sub_index_[sub_index_offset];
IndexRecord* record = hash_to_offsets[i];
int j;
for (j = num_keys_for_bucket - 1; j >= 0 && record;
j--, record = record->next) {
EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
}
assert(j == -1 && record == nullptr);
sub_index_offset += kOffsetLen * num_keys_for_bucket;
assert(sub_index_offset <= kSubIndexSize);
break;
}
void PlainTableReader::FillBloom(vector<uint32_t>* prefix_hashes) {
assert(bloom_.IsInitialized());
for (auto prefix_hash : *prefix_hashes) {
bloom_.AddHash(prefix_hash);
}
assert(sub_index_offset == kSubIndexSize);
Log(options_.info_log, "hash table size: %d, suffix_map length %zu",
index_size_, kSubIndexSize);
}
Status PlainTableReader::MmapDataFile() {
@ -445,21 +274,50 @@ Status PlainTableReader::PopulateIndex(TableProperties* props,
size_t huge_page_tlb_size) {
assert(props != nullptr);
table_properties_.reset(props);
// options.prefix_extractor is required for a hash-based look-up.
if ((options_.prefix_extractor.get() == nullptr) && (hash_table_ratio != 0)) {
BlockContents bloom_block_contents;
auto s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber,
options_.env, BloomBlockBuilder::kBloomBlock,
&bloom_block_contents);
bool index_in_file = s.ok();
BlockContents index_block_contents;
s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber,
options_.env, PlainTableIndexBuilder::kPlainTableIndexBlock,
&index_block_contents);
index_in_file &= s.ok();
Slice* bloom_block;
if (index_in_file) {
bloom_block = &bloom_block_contents.data;
} else {
bloom_block = nullptr;
}
// index_in_file == true only if there are kBloomBlock and
// kPlainTableIndexBlock
// in file
Slice* index_block;
if (index_in_file) {
index_block = &index_block_contents.data;
} else {
index_block = nullptr;
}
if ((options_.prefix_extractor.get() == nullptr) && (hash_table_ratio != 0)) {
// options.prefix_extractor is required for a hash-based look-up.
return Status::NotSupported(
"PlainTable requires a prefix extractor enable prefix hash mode.");
}
IndexRecordList record_list(kRecordsPerGroup);
// First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows
// for a prefix (starting from the first one), generate a record of (hash,
// offset) and append it to IndexRecordList, which is a data structure created
// to store them.
int num_prefixes;
if (!index_in_file) {
// Allocate bloom filter here for total order mode.
if (IsTotalOrderMode()) {
uint32_t num_bloom_bits =
@ -470,34 +328,57 @@ Status PlainTableReader::PopulateIndex(TableProperties* props,
huge_page_tlb_size, options_.info_log.get());
}
}
} else {
enable_bloom_ = true;
auto num_blocks_property = props->user_collected_properties.find(
PlainTablePropertyNames::kNumBloomBlocks);
uint32_t num_blocks = 0;
if (num_blocks_property != props->user_collected_properties.end()) {
Slice temp_slice(num_blocks_property->second);
if (!GetVarint32(&temp_slice, &num_blocks)) {
num_blocks = 0;
}
}
// cast away const qualifier, because bloom_ won't be changed
bloom_.SetRawData(
const_cast<unsigned char*>(
reinterpret_cast<const unsigned char*>(bloom_block->data())),
bloom_block->size() * 8, num_blocks);
}
PlainTableIndexBuilder index_builder(&arena_, options_, index_sparseness,
hash_table_ratio, huge_page_tlb_size);
Status s = PopulateIndexRecordList(&record_list, &num_prefixes,
bloom_bits_per_key, index_sparseness);
std::vector<uint32_t> prefix_hashes;
if (!index_in_file) {
Status s = PopulateIndexRecordList(&index_builder, &prefix_hashes);
if (!s.ok()) {
return s;
}
// Calculated hash table and bloom filter size and allocate memory for indexes
// and bloom filter based on the number of prefixes.
AllocateIndexAndBloom(num_prefixes, bloom_bits_per_key, hash_table_ratio,
huge_page_tlb_size);
// Bucketize all the index records to a temp data structure, in which for
// each bucket, we generate a linked list of IndexRecord, in reversed order.
std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
std::vector<uint32_t> entries_per_bucket(index_size_, 0);
size_t sub_index_size_needed = BucketizeIndexesAndFillBloom(
&record_list, &hash_to_offsets, &entries_per_bucket);
// From the temp data structure, populate indexes.
FillIndexes(sub_index_size_needed, hash_to_offsets, entries_per_bucket,
huge_page_tlb_size);
} else {
index_.InitFromRawData(*index_block);
}
if (!index_in_file) {
// Calculated bloom filter size and allocate memory for
// bloom filter based on the number of prefixes, then fill it.
AllocateAndFillBloom(bloom_bits_per_key, index_.GetNumPrefixes(),
huge_page_tlb_size, &prefix_hashes);
}
// Fill two table properties.
// TODO(sdong): after we have the feature of storing index in file, this
// properties need to be populated to index_size instead.
if (!index_in_file) {
props->user_collected_properties["plain_table_hash_table_size"] =
std::to_string(index_.GetIndexSize() * PlainTableIndex::kOffsetLen);
props->user_collected_properties["plain_table_sub_index_size"] =
std::to_string(index_.GetSubIndexSize());
} else {
props->user_collected_properties["plain_table_hash_table_size"] =
std::to_string(index_size_ * 4U);
std::to_string(0);
props->user_collected_properties["plain_table_sub_index_size"] =
std::to_string(sub_index_size_needed);
std::to_string(0);
}
return Status::OK();
}
@ -506,24 +387,21 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix,
uint32_t prefix_hash, bool& prefix_matched,
uint32_t* offset) const {
prefix_matched = false;
int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
uint32_t bucket_value = index_[bucket];
if (bucket_value == data_end_offset_) {
uint32_t prefix_index_offset;
auto res = index_.GetOffset(prefix_hash, &prefix_index_offset);
if (res == PlainTableIndex::kNoPrefixForBucket) {
*offset = data_end_offset_;
return Status::OK();
} else if ((bucket_value & kSubIndexMask) == 0) {
// point directly to the file
*offset = bucket_value;
} else if (res == PlainTableIndex::kDirectToFile) {
*offset = prefix_index_offset;
return Status::OK();
}
// point to sub-index, need to do a binary search
uint32_t upper_bound;
const char* base_ptr =
index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound);
uint32_t low = 0;
uint64_t prefix_index_offset = bucket_value ^ kSubIndexMask;
const char* index_ptr = &sub_index_[prefix_index_offset];
uint32_t upper_bound = 0;
const char* base_ptr = GetVarint32Ptr(index_ptr, index_ptr + 4, &upper_bound);
uint32_t high = upper_bound;
ParsedInternalKey mid_key;
ParsedInternalKey parsed_target;
@ -593,9 +471,6 @@ bool PlainTableReader::MatchBloom(uint32_t hash) const {
return !enable_bloom_ || bloom_.MayContainHash(hash);
}
Slice PlainTableReader::GetPrefix(const ParsedInternalKey& target) const {
return GetPrefixFromUserKey(target.user_key);
}
Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
ParsedInternalKey* parsed_key,
@ -650,8 +525,7 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
Slice prefix_slice;
uint32_t prefix_hash;
if (IsTotalOrderMode()) {
if (index_size_ == kFullScanModeFlag) {
// Full Scan Mode
if (full_scan_mode_) {
status_ =
Status::InvalidArgument("Get() is not allowed in full scan mode.");
}
@ -682,7 +556,6 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
if (!ParseInternalKey(target, &parsed_target)) {
return Status::Corruption(Slice());
}
Slice found_value;
PlainTableKeyDecoder decoder(encoding_type_, user_key_len_,
options_.prefix_extractor.get());
@ -747,13 +620,12 @@ void PlainTableIterator::Seek(const Slice& target) {
// If the user doesn't set prefix seek option and we are not able to do a
// total Seek(). assert failure.
if (!use_prefix_seek_) {
if (table_->index_size_ == PlainTableReader::kFullScanModeFlag) {
// Full Scan Mode.
if (table_->full_scan_mode_) {
status_ =
Status::InvalidArgument("Seek() is not allowed in full scan mode.");
offset_ = next_offset_ = table_->data_end_offset_;
return;
} else if (table_->index_size_ > 1) {
} else if (table_->GetIndexSize() > 1) {
assert(false);
status_ = Status::NotSupported(
"PlainTable cannot issue non-prefix seek unless in total order "

@ -19,12 +19,14 @@
#include "rocksdb/table_properties.h"
#include "table/table_reader.h"
#include "table/plain_table_factory.h"
#include "table/plain_table_index.h"
#include "util/arena.h"
#include "util/dynamic_bloom.h"
namespace rocksdb {
class Block;
class BlockContents;
class BlockHandle;
class Footer;
struct Options;
@ -37,6 +39,7 @@ class PlainTableKeyDecoder;
using std::unique_ptr;
using std::unordered_map;
using std::vector;
extern const uint32_t kPlainTableVariableLength;
// Based on following output file format shown in plain_table_factory.h
@ -68,6 +71,7 @@ class PlainTableReader: public TableReader {
uint64_t ApproximateOffsetOf(const Slice& key);
uint32_t GetIndexSize() const { return index_.GetIndexSize(); }
void SetupForCompaction();
std::shared_ptr<const TableProperties> GetTableProperties() const {
@ -93,65 +97,23 @@ class PlainTableReader: public TableReader {
// props: the table properties object that need to be stored. Ownership of
// the object will be passed.
//
// index_ contains buckets size of index_size_, each is a
// 32-bit integer. The lower 31 bits contain an offset value (explained below)
// and the first bit of the integer indicates type of the offset.
//
// +--------------+------------------------------------------------------+
// | Flag (1 bit) | Offset to binary search buffer or file (31 bits) +
// +--------------+------------------------------------------------------+
//
// Explanation for the "flag bit":
//
// 0 indicates that the bucket contains only one prefix (no conflict when
// hashing this prefix), whose first row starts from this offset of the
// file.
// 1 indicates that the bucket contains more than one prefixes, or there
// are too many rows for one prefix so we need a binary search for it. In
// this case, the offset indicates the offset of sub_index_ holding the
// binary search indexes of keys for those rows. Those binary search indexes
// are organized in this way:
//
// The first 4 bytes, indicate how many indexes (N) are stored after it. After
// it, there are N 32-bit integers, each points of an offset of the file,
// which
// points to starting of a row. Those offsets need to be guaranteed to be in
// ascending order so the keys they are pointing to are also in ascending
// order
// to make sure we can use them to do binary searches. Below is visual
// presentation of a bucket.
//
// <begin>
// number_of_records: varint32
// record 1 file offset: fixedint32
// record 2 file offset: fixedint32
// ....
// record N file offset: fixedint32
// <end>
Status PopulateIndex(TableProperties* props, int bloom_bits_per_key,
double hash_table_ratio, size_t index_sparseness,
size_t huge_page_tlb_size);
Status MmapDataFile();
private:
struct IndexRecord;
class IndexRecordList;
// Plain table maintains an index and a sub index.
// index is implemented by a hash table.
// subindex is a big of memory array.
// For more details about the in-memory index, please refer to:
// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
// #wiki-in-memory-index-format
uint32_t* index_;
int index_size_ = 0;
char* sub_index_;
const InternalKeyComparator internal_comparator_;
EncodingType encoding_type_;
// represents plain table's current status.
Status status_;
Slice file_data_;
PlainTableIndex index_;
bool full_scan_mode_;
// data_start_offset_ and data_end_offset_ defines the range of the
// sst file that stores data.
const uint32_t data_start_offset_ = 0;
@ -160,11 +122,6 @@ class PlainTableReader: public TableReader {
const SliceTransform* prefix_extractor_;
static const size_t kNumInternalBytes = 8;
static const uint32_t kSubIndexMask = 0x80000000;
static const size_t kOffsetLen = sizeof(uint32_t);
static const uint64_t kMaxFileSize = 1u << 31;
static const size_t kRecordsPerGroup = 256;
static const int kFullScanModeFlag = -1;
// Bloom filter is used to rule out non-existent key
bool enable_bloom_;
@ -184,6 +141,31 @@ class PlainTableReader: public TableReader {
return user_key_len_ + kNumInternalBytes;
}
// Returns the prefix of an internal key: the user key part is cut out first
// and then handed to GetPrefixFromUserKey().
Slice GetPrefix(const Slice& target) const {
  // target is internal key, so it cannot be shorter than 8 bytes
  assert(!(target.size() < 8));
  const Slice user_key = GetUserKey(target);
  return GetPrefixFromUserKey(user_key);
}
// Returns the prefix of an already parsed internal key, derived from its
// user_key field.
Slice GetPrefix(const ParsedInternalKey& target) const {
  const Slice& user_key = target.user_key;
  return GetPrefixFromUserKey(user_key);
}
// Returns the user key part of an internal key, i.e. `key` without its
// trailing kNumInternalBytes bytes. The returned Slice points into the same
// memory as `key`.
Slice GetUserKey(const Slice& key) const {
  // A shorter key would make the unsigned subtraction below wrap around and
  // produce a huge bogus length.
  assert(key.size() >= kNumInternalBytes);
  return Slice(key.data(), key.size() - kNumInternalBytes);
}
// Returns the prefix of a user key: the prefix extractor's transform of the
// key, or an empty Slice in total order mode.
Slice GetPrefixFromUserKey(const Slice& user_key) const {
  if (IsTotalOrderMode()) {
    // Without a prefix extractor the empty slice serves as the prefix. In
    // that case it falls back to pure binary search and total iterator seek
    // is supported.
    return Slice();
  }
  return prefix_extractor_->Transform(user_key);
}
friend class TableCache;
friend class PlainTableIterator;
@ -191,33 +173,15 @@ class PlainTableReader: public TableReader {
// the rows, which contains index records as a list.
// If bloom_ is not null, all the keys' full-key hash will be added to the
// bloom filter.
Status PopulateIndexRecordList(IndexRecordList* record_list,
int* num_prefixes, int bloom_bits_per_key,
size_t index_sparseness);
Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder,
vector<uint32_t>* prefix_hashes);
// Internal helper function to allocate memory for indexes and bloom filters
void AllocateIndexAndBloom(int num_prefixes, int bloom_bits_per_key,
double hash_table_ratio,
size_t huge_page_tlb_size);
// Internal helper function to allocate memory for bloom filter and fill it
void AllocateAndFillBloom(int bloom_bits_per_key, int num_prefixes,
size_t huge_page_tlb_size,
vector<uint32_t>* prefix_hashes);
// Internal helper function to bucket index record list to hash buckets.
// bucket_header is a vector of size hash_table_size_, with each entry
// containing a linklist of IndexRecord hashed to the same bucket, in reverse
// order.
// of offsets for the hash, in reversed order.
// entries_per_bucket is sized of index_size_. The value is how many index
// records are there in bucket_headers for the same bucket.
size_t BucketizeIndexesAndFillBloom(
IndexRecordList* record_list, std::vector<IndexRecord*>* bucket_headers,
std::vector<uint32_t>* entries_per_bucket);
// Internal helper class to fill the indexes and bloom filters to internal
// data structures. bucket_headers and entries_per_bucket are bucketized
// indexes and counts generated by BucketizeIndexesAndFillBloom().
void FillIndexes(const size_t kSubIndexSize,
const std::vector<IndexRecord*>& bucket_headers,
const std::vector<uint32_t>& entries_per_bucket,
size_t huge_page_tlb_size);
void FillBloom(vector<uint32_t>* prefix_hashes);
// Read the key and value at `offset` to parameters for keys, the and
// `seekable`.
@ -237,28 +201,6 @@ class PlainTableReader: public TableReader {
uint32_t prefix_hash, bool& prefix_matched,
uint32_t* offset) const;
// Returns the user key part of an internal key, i.e. `key` without its
// trailing kNumInternalBytes bytes. The returned Slice points into the same
// memory as `key`.
Slice GetUserKey(const Slice& key) const {
  // A shorter key would make the unsigned subtraction below wrap around and
  // produce a huge bogus length.
  assert(key.size() >= kNumInternalBytes);
  return Slice(key.data(), key.size() - kNumInternalBytes);
}
// Returns the prefix of an internal key: the user key part is cut out first
// and then handed to GetPrefixFromUserKey().
Slice GetPrefix(const Slice& target) const {
  // target is internal key, so it cannot be shorter than 8 bytes
  assert(!(target.size() < 8));
  const Slice user_key = GetUserKey(target);
  return GetPrefixFromUserKey(user_key);
}
inline Slice GetPrefix(const ParsedInternalKey& target) const;
// Returns the prefix of a user key: the prefix extractor's transform of the
// key, or an empty Slice in total order mode.
Slice GetPrefixFromUserKey(const Slice& user_key) const {
  if (IsTotalOrderMode()) {
    // Without a prefix extractor the empty slice serves as the prefix. In
    // that case it falls back to pure binary search and total iterator seek
    // is supported.
    return Slice();
  }
  return prefix_extractor_->Transform(user_key);
}
bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
// No copying allowed

@ -48,6 +48,13 @@ DynamicBloom::DynamicBloom(uint32_t num_probes,
kNumProbes(num_probes),
hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {}
// Points the filter at an existing bit array instead of allocating one.
// Only the pointer is stored (nothing is copied), together with the given
// number of bits and number of blocks.
void DynamicBloom::SetRawData(unsigned char* raw_data, uint32_t total_bits,
                              uint32_t num_blocks) {
  kNumBlocks = num_blocks;
  kTotalBits = total_bits;
  data_ = raw_data;
}
void DynamicBloom::SetTotalBits(Arena* arena,
uint32_t total_bits, uint32_t locality,
size_t huge_page_tlb_size,

@ -5,6 +5,10 @@
#pragma once
#include <string>
#include "rocksdb/slice.h"
#include <util/arena.h>
#include <port/port_posix.h>
@ -57,6 +61,19 @@ class DynamicBloom {
void Prefetch(uint32_t h);
uint32_t GetNumBlocks() const { return kNumBlocks; }
// Returns a Slice that views the filter's bit array: it starts at data_ and
// is GetTotalBits() / 8 bytes long. No data is copied.
// NOTE(review): the division is integral, so this presumably relies on the
// total bit count being a multiple of 8 - confirm against SetTotalBits().
Slice GetRawData() const {
return Slice(reinterpret_cast<char*>(data_), GetTotalBits() / 8);
}
void SetRawData(unsigned char* raw_data, uint32_t total_bits,
uint32_t num_blocks = 0);
uint32_t GetTotalBits() const { return kTotalBits; }
bool IsInitialized() const { return kNumBlocks > 0 || kTotalBits > 0; }
private:
uint32_t kTotalBits;
uint32_t kNumBlocks;
@ -81,7 +98,7 @@ inline void DynamicBloom::Prefetch(uint32_t h) {
}
inline bool DynamicBloom::MayContainHash(uint32_t h) const {
assert(kNumBlocks > 0 || kTotalBits > 0);
assert(IsInitialized());
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
if (kNumBlocks != 0) {
uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
@ -98,10 +115,6 @@ inline bool DynamicBloom::MayContainHash(uint32_t h) const {
h += delta;
}
} else {
if (kTotalBits == 0) {
// Not initialized.
return true;
}
for (uint32_t i = 0; i < kNumProbes; ++i) {
const uint32_t bitpos = h % kTotalBits;
if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
@ -114,7 +127,7 @@ inline bool DynamicBloom::MayContainHash(uint32_t h) const {
}
inline void DynamicBloom::AddHash(uint32_t h) {
assert(kNumBlocks > 0 || kTotalBits > 0);
assert(IsInitialized());
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
if (kNumBlocks != 0) {
uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);

@ -17,4 +17,7 @@ namespace rocksdb {
extern uint32_t Hash(const char* data, size_t n, uint32_t seed);
// Convenience wrapper around Hash() for a whole Slice; always uses the
// seed 397.
inline uint32_t GetSliceHash(const Slice& s) {
  const char* const bytes = s.data();
  const size_t length = s.size();
  return Hash(bytes, length, 397);
}
}

Loading…
Cancel
Save