Adding option to save PlainTable index and bloom filter in SST file.

Summary:
Adding option to save PlainTable index and bloom filter in SST file.
If there is no bloom block and/or index block in the file, PlainTableReader builds
new ones; otherwise it simply uses the stored blocks.
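
For context, a minimal usage sketch (not part of this change; the path, key length, and tuning values are made up) of how the new PlainTableOptions::store_index_in_file flag is enabled alongside the existing plain-table options:

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.allow_mmap_reads = true;  // PlainTable requires mmap reads
  options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(8));

  rocksdb::PlainTableOptions plain_table_options;
  plain_table_options.user_key_len = 16;
  plain_table_options.bloom_bits_per_key = 10;
  plain_table_options.hash_table_ratio = 0.75;
  plain_table_options.index_sparseness = 16;
  // New in this change: persist the index and bloom filter into the SST
  // file so PlainTableReader can mmap them instead of rebuilding on open.
  plain_table_options.store_index_in_file = true;
  options.table_factory.reset(
      rocksdb::NewPlainTableFactory(plain_table_options));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/plain_table_demo", &db);
  if (s.ok()) {
    s = db->Put(rocksdb::WriteOptions(), "0000000012345foo", "v1");
    delete db;
  }
  return s.ok() ? 0 : 1;
}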

Test Plan: make all check

Reviewers: sdong

Reviewed By: sdong

Subscribers: leveldb

Differential Revision: https://reviews.facebook.net/D19527
main
Stanislau Hlebik
parent 92d73cbe78
commit 9d70cce047
  1. db/plain_table_db_test.cc (198)
  2. include/rocksdb/table.h (7)
  3. table/bloom_block.cc (23)
  4. table/bloom_block.h (37)
  5. table/meta_blocks.cc (68)
  6. table/meta_blocks.h (15)
  7. table/plain_table_builder.cc (99)
  8. table/plain_table_builder.h (48)
  9. table/plain_table_factory.cc (10)
  10. table/plain_table_factory.h (4)
  11. table/plain_table_index.cc (196)
  12. table/plain_table_index.h (221)
  13. table/plain_table_reader.cc (392)
  14. table/plain_table_reader.h (144)
  15. util/dynamic_bloom.cc (7)
  16. util/dynamic_bloom.h (25)
  17. util/hash.h (3)

@ -23,6 +23,7 @@
#include "rocksdb/slice_transform.h" #include "rocksdb/slice_transform.h"
#include "rocksdb/table.h" #include "rocksdb/table.h"
#include "table/meta_blocks.h" #include "table/meta_blocks.h"
#include "table/bloom_block.h"
#include "table/plain_table_factory.h" #include "table/plain_table_factory.h"
#include "table/plain_table_reader.h" #include "table/plain_table_reader.h"
#include "util/hash.h" #include "util/hash.h"
@ -70,10 +71,11 @@ class PlainTableDBTest {
plain_table_options.huge_page_tlb_size = 0; plain_table_options.huge_page_tlb_size = 0;
plain_table_options.encoding_type = kPrefix; plain_table_options.encoding_type = kPrefix;
plain_table_options.full_scan_mode = false; plain_table_options.full_scan_mode = false;
plain_table_options.store_index_in_file = false;
options.table_factory.reset(NewPlainTableFactory(plain_table_options)); options.table_factory.reset(NewPlainTableFactory(plain_table_options));
options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true, 3));
options.prefix_extractor.reset(NewFixedPrefixTransform(8)); options.prefix_extractor.reset(NewFixedPrefixTransform(8));
options.allow_mmap_reads = true; options.allow_mmap_reads = true;
return options; return options;
@ -186,6 +188,8 @@ TEST(PlainTableDBTest, Empty) {
ASSERT_EQ("NOT_FOUND", Get("0000000000000foo")); ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
} }
extern const uint64_t kPlainTableMagicNumber;
class TestPlainTableReader : public PlainTableReader { class TestPlainTableReader : public PlainTableReader {
public: public:
TestPlainTableReader(const EnvOptions& storage_options, TestPlainTableReader(const EnvOptions& storage_options,
@ -195,7 +199,8 @@ class TestPlainTableReader : public PlainTableReader {
size_t index_sparseness, size_t index_sparseness,
const TableProperties* table_properties, const TableProperties* table_properties,
unique_ptr<RandomAccessFile>&& file, unique_ptr<RandomAccessFile>&& file,
const Options& options, bool* expect_bloom_not_match) const Options& options, bool* expect_bloom_not_match,
bool store_index_in_file)
: PlainTableReader(options, std::move(file), storage_options, icomparator, : PlainTableReader(options, std::move(file), storage_options, icomparator,
encoding_type, file_size, table_properties), encoding_type, file_size, table_properties),
expect_bloom_not_match_(expect_bloom_not_match) { expect_bloom_not_match_(expect_bloom_not_match) {
@ -206,6 +211,19 @@ class TestPlainTableReader : public PlainTableReader {
bloom_bits_per_key, hash_table_ratio, index_sparseness, bloom_bits_per_key, hash_table_ratio, index_sparseness,
2 * 1024 * 1024); 2 * 1024 * 1024);
ASSERT_TRUE(s.ok()); ASSERT_TRUE(s.ok());
TableProperties* props = const_cast<TableProperties*>(table_properties);
if (store_index_in_file) {
auto bloom_version_ptr = props->user_collected_properties.find(
PlainTablePropertyNames::kBloomVersion);
ASSERT_TRUE(bloom_version_ptr != props->user_collected_properties.end());
ASSERT_EQ(bloom_version_ptr->second, std::string("1"));
if (options.bloom_locality > 0) {
auto num_blocks_ptr = props->user_collected_properties.find(
PlainTablePropertyNames::kNumBloomBlocks);
ASSERT_TRUE(num_blocks_ptr != props->user_collected_properties.end());
}
}
} }
virtual ~TestPlainTableReader() {} virtual ~TestPlainTableReader() {}
@ -213,7 +231,11 @@ class TestPlainTableReader : public PlainTableReader {
private: private:
virtual bool MatchBloom(uint32_t hash) const override { virtual bool MatchBloom(uint32_t hash) const override {
bool ret = PlainTableReader::MatchBloom(hash); bool ret = PlainTableReader::MatchBloom(hash);
ASSERT_TRUE(!*expect_bloom_not_match_ || !ret); if (*expect_bloom_not_match_) {
ASSERT_TRUE(!ret);
} else {
ASSERT_TRUE(ret);
}
return ret; return ret;
} }
bool* expect_bloom_not_match_; bool* expect_bloom_not_match_;
@ -228,6 +250,7 @@ class TestPlainTableFactory : public PlainTableFactory {
bloom_bits_per_key_(options.bloom_bits_per_key), bloom_bits_per_key_(options.bloom_bits_per_key),
hash_table_ratio_(options.hash_table_ratio), hash_table_ratio_(options.hash_table_ratio),
index_sparseness_(options.index_sparseness), index_sparseness_(options.index_sparseness),
store_index_in_file_(options.store_index_in_file),
expect_bloom_not_match_(expect_bloom_not_match) {} expect_bloom_not_match_(expect_bloom_not_match) {}
Status NewTableReader(const Options& options, const EnvOptions& soptions, Status NewTableReader(const Options& options, const EnvOptions& soptions,
@ -239,6 +262,20 @@ class TestPlainTableFactory : public PlainTableFactory {
options.env, options.info_log.get(), &props); options.env, options.info_log.get(), &props);
ASSERT_TRUE(s.ok()); ASSERT_TRUE(s.ok());
if (store_index_in_file_) {
BlockHandle bloom_block_handle;
s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber,
options.env, BloomBlockBuilder::kBloomBlock,
&bloom_block_handle);
ASSERT_TRUE(s.ok());
BlockHandle index_block_handle;
s = FindMetaBlock(
file.get(), file_size, kPlainTableMagicNumber, options.env,
PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_handle);
ASSERT_TRUE(s.ok());
}
auto& user_props = props->user_collected_properties; auto& user_props = props->user_collected_properties;
auto encoding_type_prop = auto encoding_type_prop =
user_props.find(PlainTablePropertyNames::kEncodingType); user_props.find(PlainTablePropertyNames::kEncodingType);
@ -249,7 +286,8 @@ class TestPlainTableFactory : public PlainTableFactory {
std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader( std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader(
soptions, internal_comparator, encoding_type, file_size, soptions, internal_comparator, encoding_type, file_size,
bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props, bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props,
std::move(file), options, expect_bloom_not_match_)); std::move(file), options, expect_bloom_not_match_,
store_index_in_file_));
*table = std::move(new_reader); *table = std::move(new_reader);
return s; return s;
@ -259,6 +297,7 @@ class TestPlainTableFactory : public PlainTableFactory {
int bloom_bits_per_key_; int bloom_bits_per_key_;
double hash_table_ratio_; double hash_table_ratio_;
size_t index_sparseness_; size_t index_sparseness_;
bool store_index_in_file_;
bool* expect_bloom_not_match_; bool* expect_bloom_not_match_;
}; };
@ -268,59 +307,75 @@ TEST(PlainTableDBTest, Flush) {
for (EncodingType encoding_type : {kPlain, kPrefix}) { for (EncodingType encoding_type : {kPlain, kPrefix}) {
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
for (int total_order = 0; total_order <= 1; total_order++) { for (int total_order = 0; total_order <= 1; total_order++) {
Options options = CurrentOptions(); for (int store_index_in_file = 0; store_index_in_file <= 1;
options.create_if_missing = true; ++store_index_in_file) {
// Set only one bucket to force bucket conflict. if (!bloom_bits && store_index_in_file) {
// Test index interval for the same prefix to be 1, 2 and 4 continue;
if (total_order) { }
options.prefix_extractor.reset();
PlainTableOptions plain_table_options;
plain_table_options.user_key_len = 0;
plain_table_options.bloom_bits_per_key = bloom_bits;
plain_table_options.hash_table_ratio = 0;
plain_table_options.index_sparseness = 2;
plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
plain_table_options.encoding_type = encoding_type;
plain_table_options.full_scan_mode = false;
options.table_factory.reset(
NewPlainTableFactory(plain_table_options));
} else {
PlainTableOptions plain_table_options;
plain_table_options.user_key_len = 0;
plain_table_options.bloom_bits_per_key = bloom_bits;
plain_table_options.hash_table_ratio = 0.75;
plain_table_options.index_sparseness = 16;
plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
plain_table_options.encoding_type = encoding_type;
plain_table_options.full_scan_mode = false;
options.table_factory.reset( Options options = CurrentOptions();
NewPlainTableFactory(plain_table_options)); options.create_if_missing = true;
// Set only one bucket to force bucket conflict.
// Test index interval for the same prefix to be 1, 2 and 4
if (total_order) {
options.prefix_extractor.reset();
PlainTableOptions plain_table_options;
plain_table_options.user_key_len = 0;
plain_table_options.bloom_bits_per_key = bloom_bits;
plain_table_options.hash_table_ratio = 0;
plain_table_options.index_sparseness = 2;
plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
plain_table_options.encoding_type = encoding_type;
plain_table_options.full_scan_mode = false;
plain_table_options.store_index_in_file = store_index_in_file;
options.table_factory.reset(
NewPlainTableFactory(plain_table_options));
} else {
PlainTableOptions plain_table_options;
plain_table_options.user_key_len = 0;
plain_table_options.bloom_bits_per_key = bloom_bits;
plain_table_options.hash_table_ratio = 0.75;
plain_table_options.index_sparseness = 16;
plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
plain_table_options.encoding_type = encoding_type;
plain_table_options.full_scan_mode = false;
plain_table_options.store_index_in_file = store_index_in_file;
options.table_factory.reset(
NewPlainTableFactory(plain_table_options));
}
DestroyAndReopen(&options);
ASSERT_OK(Put("1000000000000foo", "v1"));
ASSERT_OK(Put("0000000000000bar", "v2"));
ASSERT_OK(Put("1000000000000foo", "v3"));
dbfull()->TEST_FlushMemTable();
TablePropertiesCollection ptc;
reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
ASSERT_EQ(1U, ptc.size());
auto row = ptc.begin();
auto tp = row->second;
if (!store_index_in_file) {
ASSERT_EQ(total_order ? "4" : "12",
(tp->user_collected_properties)
.at("plain_table_hash_table_size"));
ASSERT_EQ("0", (tp->user_collected_properties)
.at("plain_table_sub_index_size"));
} else {
ASSERT_EQ("0", (tp->user_collected_properties)
.at("plain_table_hash_table_size"));
ASSERT_EQ("0", (tp->user_collected_properties)
.at("plain_table_sub_index_size"));
}
ASSERT_EQ("v3", Get("1000000000000foo"));
ASSERT_EQ("v2", Get("0000000000000bar"));
}
} }
DestroyAndReopen(&options);
ASSERT_OK(Put("1000000000000foo", "v1"));
ASSERT_OK(Put("0000000000000bar", "v2"));
ASSERT_OK(Put("1000000000000foo", "v3"));
dbfull()->TEST_FlushMemTable();
TablePropertiesCollection ptc;
reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
ASSERT_EQ(1U, ptc.size());
auto row = ptc.begin();
auto tp = row->second;
ASSERT_EQ(total_order ? "4" : "12", (tp->user_collected_properties).at(
"plain_table_hash_table_size"));
ASSERT_EQ("0", (tp->user_collected_properties)
.at("plain_table_sub_index_size"));
ASSERT_EQ("v3", Get("1000000000000foo"));
ASSERT_EQ("v2", Get("0000000000000bar"));
} }
} }
}
} }
} }
@ -330,7 +385,15 @@ TEST(PlainTableDBTest, Flush2) {
for (EncodingType encoding_type : {kPlain, kPrefix}) { for (EncodingType encoding_type : {kPlain, kPrefix}) {
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
for (int total_order = 0; total_order <= 1; total_order++) { for (int total_order = 0; total_order <= 1; total_order++) {
if (encoding_type == kPrefix && total_order == 1) { for (int store_index_in_file = 0; store_index_in_file <= 1;
++store_index_in_file) {
if (encoding_type == kPrefix && total_order) {
continue;
}
if (!bloom_bits && store_index_in_file) {
continue;
}
if (total_order && store_index_in_file) {
continue; continue;
} }
bool expect_bloom_not_match = false; bool expect_bloom_not_match = false;
@ -338,30 +401,23 @@ TEST(PlainTableDBTest, Flush2) {
options.create_if_missing = true; options.create_if_missing = true;
// Set only one bucket to force bucket conflict. // Set only one bucket to force bucket conflict.
// Test index interval for the same prefix to be 1, 2 and 4 // Test index interval for the same prefix to be 1, 2 and 4
PlainTableOptions plain_table_options;
if (total_order) { if (total_order) {
options.prefix_extractor = nullptr; options.prefix_extractor = nullptr;
PlainTableOptions plain_table_options;
plain_table_options.user_key_len = 0;
plain_table_options.bloom_bits_per_key = bloom_bits;
plain_table_options.hash_table_ratio = 0; plain_table_options.hash_table_ratio = 0;
plain_table_options.index_sparseness = 2; plain_table_options.index_sparseness = 2;
plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
plain_table_options.encoding_type = encoding_type;
options.table_factory.reset(new TestPlainTableFactory(
&expect_bloom_not_match, plain_table_options));
} else { } else {
PlainTableOptions plain_table_options;
plain_table_options.user_key_len = 0;
plain_table_options.bloom_bits_per_key = bloom_bits;
plain_table_options.hash_table_ratio = 0.75; plain_table_options.hash_table_ratio = 0.75;
plain_table_options.index_sparseness = 16; plain_table_options.index_sparseness = 16;
plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
plain_table_options.encoding_type = encoding_type;
options.table_factory.reset(new TestPlainTableFactory(
&expect_bloom_not_match, plain_table_options));
} }
plain_table_options.user_key_len = kPlainTableVariableLength;
plain_table_options.bloom_bits_per_key = bloom_bits;
plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
plain_table_options.encoding_type = encoding_type;
plain_table_options.store_index_in_file = store_index_in_file;
options.table_factory.reset(new TestPlainTableFactory(
&expect_bloom_not_match, plain_table_options));
DestroyAndReopen(&options); DestroyAndReopen(&options);
ASSERT_OK(Put("0000000000000bar", "b")); ASSERT_OK(Put("0000000000000bar", "b"));
ASSERT_OK(Put("1000000000000foo", "v1")); ASSERT_OK(Put("1000000000000foo", "v1"));
@ -389,7 +445,6 @@ TEST(PlainTableDBTest, Flush2) {
// Neither key nor value should exist. // Neither key nor value should exist.
expect_bloom_not_match = true; expect_bloom_not_match = true;
ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar")); ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
// Key doesn't exist any more but prefix exists. // Key doesn't exist any more but prefix exists.
if (total_order) { if (total_order) {
ASSERT_EQ("NOT_FOUND", Get("1000000000000not")); ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
@ -398,6 +453,7 @@ TEST(PlainTableDBTest, Flush2) {
expect_bloom_not_match = false; expect_bloom_not_match = false;
} }
} }
}
} }
} }
} }

@ -119,6 +119,8 @@ enum EncodingType : char {
struct PlainTablePropertyNames { struct PlainTablePropertyNames {
static const std::string kPrefixExtractorName; static const std::string kPrefixExtractorName;
static const std::string kEncodingType; static const std::string kEncodingType;
static const std::string kBloomVersion;
static const std::string kNumBloomBlocks;
}; };
const uint32_t kPlainTableVariableLength = 0; const uint32_t kPlainTableVariableLength = 0;
@ -166,6 +168,11 @@ EncodingType encoding_type = kPlain;
// @full_scan_mode: mode for reading the whole file one record by one without // @full_scan_mode: mode for reading the whole file one record by one without
// using the index. // using the index.
bool full_scan_mode = false; bool full_scan_mode = false;
// @store_index_in_file: compute plain table index and bloom filter during
// file building and store it in file. When reading
// file, index will be mmaped instead of recomputation.
bool store_index_in_file = false;
}; };
// -- Plain Table with prefix-only seek // -- Plain Table with prefix-only seek
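
The kBloomVersion and kNumBloomBlocks names added above show up in each table's user_collected_properties. A hedged sketch (the helper function is made up; it assumes a DB opened with a plain-table factory and store_index_in_file enabled) of reading them back through GetPropertiesOfAllTables, much as the updated plain_table_db_test does:

#include <iostream>
#include "rocksdb/db.h"
#include "rocksdb/table_properties.h"

void DumpPlainTableBloomProps(rocksdb::DB* db) {
  rocksdb::TablePropertiesCollection props;
  rocksdb::Status s = db->GetPropertiesOfAllTables(&props);
  if (!s.ok()) {
    std::cerr << "GetPropertiesOfAllTables: " << s.ToString() << std::endl;
    return;
  }
  for (const auto& file_and_props : props) {
    const auto& user = file_and_props.second->user_collected_properties;
    auto bloom_version = user.find("rocksdb.plain.table.bloom.version");
    if (bloom_version != user.end()) {
      // Set to "1" when the index and bloom filter were persisted.
      std::cout << file_and_props.first << ": bloom version "
                << bloom_version->second << std::endl;
    }
    // Present only when bloom_locality > 0; the value is a varint32 count,
    // so it is not printable as plain text.
    if (user.count("rocksdb.plain.table.bloom.numblocks") > 0) {
      std::cout << file_and_props.first << ": has numblocks property"
                << std::endl;
    }
  }
}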

@ -0,0 +1,23 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include "table/bloom_block.h"
#include <string>
#include "rocksdb/slice.h"
#include "util/dynamic_bloom.h"
namespace rocksdb {
void BloomBlockBuilder::AddKeysHashes(const std::vector<uint32_t> keys_hashes) {
for (auto hash : keys_hashes) {
bloom_.AddHash(hash);
}
}
Slice BloomBlockBuilder::Finish() { return bloom_.GetRawData(); }
const std::string BloomBlockBuilder::kBloomBlock = "kBloomBlock";
} // namespace rocksdb

@ -0,0 +1,37 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#include <vector>
#include <string>
#include "util/dynamic_bloom.h"
namespace rocksdb {
class Logger;
class BloomBlockBuilder {
public:
static const std::string kBloomBlock;
explicit BloomBlockBuilder(uint32_t num_probes = 6)
: bloom_(num_probes, nullptr) {}
void SetTotalBits(Arena* arena, uint32_t total_bits, uint32_t locality,
size_t huge_page_tlb_size, Logger* logger) {
bloom_.SetTotalBits(arena, total_bits, locality, huge_page_tlb_size,
logger);
}
uint32_t GetNumBlocks() const { return bloom_.GetNumBlocks(); }
void AddKeysHashes(const std::vector<uint32_t> keys_hashes);
Slice Finish();
private:
DynamicBloom bloom_;
};
}; // namespace rocksdb
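
A small sketch of the intended call sequence for BloomBlockBuilder, mirroring what PlainTableBuilder::Finish() does later in this diff; the free function and its arguments are illustrative only:

#include <vector>
#include "rocksdb/slice.h"
#include "table/bloom_block.h"
#include "util/arena.h"

namespace rocksdb {

// Illustrative helper, not RocksDB API: size and fill one bloom filter for
// all key (or prefix) hashes collected while building a plain table file.
Slice BuildBloomBlock(Arena* arena, const std::vector<uint32_t>& hashes,
                      uint32_t bloom_bits_per_key, Logger* logger) {
  BloomBlockBuilder bloom_block(/* num_probes */ 6);
  // locality 0 keeps a single unblocked bloom; huge_page_tlb_size 0 disables
  // huge pages, matching the defaults used elsewhere in this change.
  bloom_block.SetTotalBits(
      arena, static_cast<uint32_t>(hashes.size()) * bloom_bits_per_key,
      /* locality */ 0, /* huge_page_tlb_size */ 0, logger);
  bloom_block.AddKeysHashes(hashes);
  return bloom_block.Finish();  // raw filter bytes, written as a meta block
}

}  // namespace rocksdb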

@ -273,4 +273,72 @@ Status FindMetaBlock(Iterator* meta_index_iter,
} }
} }
Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size,
uint64_t table_magic_number, Env* env,
const std::string& meta_block_name,
BlockHandle* block_handle) {
Footer footer(table_magic_number);
auto s = ReadFooterFromFile(file, file_size, &footer);
if (!s.ok()) {
return s;
}
auto metaindex_handle = footer.metaindex_handle();
BlockContents metaindex_contents;
ReadOptions read_options;
read_options.verify_checksums = false;
s = ReadBlockContents(file, footer, read_options, metaindex_handle,
&metaindex_contents, env, false);
if (!s.ok()) {
return s;
}
Block metaindex_block(metaindex_contents);
std::unique_ptr<Iterator> meta_iter;
meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator()));
return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle);
}
Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size,
uint64_t table_magic_number, Env* env,
const std::string& meta_block_name,
BlockContents* contents) {
Footer footer(table_magic_number);
auto s = ReadFooterFromFile(file, file_size, &footer);
if (!s.ok()) {
return s;
}
// Reading metaindex block
auto metaindex_handle = footer.metaindex_handle();
BlockContents metaindex_contents;
ReadOptions read_options;
read_options.verify_checksums = false;
s = ReadBlockContents(file, footer, read_options, metaindex_handle,
&metaindex_contents, env, false);
if (!s.ok()) {
return s;
}
// Finding metablock
Block metaindex_block(metaindex_contents);
std::unique_ptr<Iterator> meta_iter;
meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator()));
BlockHandle block_handle;
s = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle);
if (!s.ok()) {
return s;
}
// Reading metablock
s = ReadBlockContents(file, footer, read_options, block_handle, contents, env,
false);
return s;
}
} // namespace rocksdb } // namespace rocksdb

@ -15,6 +15,7 @@
#include "rocksdb/slice.h" #include "rocksdb/slice.h"
#include "rocksdb/table_properties.h" #include "rocksdb/table_properties.h"
#include "table/block_builder.h" #include "table/block_builder.h"
#include "table/format.h"
namespace rocksdb { namespace rocksdb {
@ -128,4 +129,18 @@ Status FindMetaBlock(Iterator* meta_index_iter,
const std::string& meta_block_name, const std::string& meta_block_name,
BlockHandle* block_handle); BlockHandle* block_handle);
// Find the meta block
Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size,
uint64_t table_magic_number, Env* env,
const std::string& meta_block_name,
BlockHandle* block_handle);
// Read the specified meta block with name meta_block_name
// from `file` and initialize `contents` with contents of this block.
// Return Status::OK in case of success.
Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size,
uint64_t table_magic_number, Env* env,
const std::string& meta_block_name,
BlockContents* contents);
} // namespace rocksdb } // namespace rocksdb
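
For completeness, a sketch of how the new FindMetaBlock overload is driven by the test code in this diff; the wrapper function below is made up, and kPlainTableMagicNumber is the extern defined in plain_table_builder.cc:

#include "rocksdb/env.h"
#include "rocksdb/status.h"
#include "table/bloom_block.h"
#include "table/format.h"
#include "table/meta_blocks.h"
#include "table/plain_table_index.h"

namespace rocksdb {

extern const uint64_t kPlainTableMagicNumber;

// Illustrative helper: succeeds only if both of the new meta blocks were
// persisted, i.e. the file was built with store_index_in_file = true.
Status CheckPlainTableMetaBlocks(RandomAccessFile* file, uint64_t file_size,
                                 Env* env) {
  BlockHandle bloom_block_handle;
  Status s = FindMetaBlock(file, file_size, kPlainTableMagicNumber, env,
                           BloomBlockBuilder::kBloomBlock, &bloom_block_handle);
  if (!s.ok()) {
    return s;  // no persisted bloom block; the reader will rebuild it
  }
  BlockHandle index_block_handle;
  return FindMetaBlock(file, file_size, kPlainTableMagicNumber, env,
                       PlainTableIndexBuilder::kPlainTableIndexBlock,
                       &index_block_handle);
}

}  // namespace rocksdb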

@ -6,6 +6,7 @@
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
#include "table/plain_table_builder.h" #include "table/plain_table_builder.h"
#include <string>
#include <assert.h> #include <assert.h>
#include <map> #include <map>
@ -17,6 +18,8 @@
#include "table/plain_table_factory.h" #include "table/plain_table_factory.h"
#include "db/dbformat.h" #include "db/dbformat.h"
#include "table/block_builder.h" #include "table/block_builder.h"
#include "table/bloom_block.h"
#include "table/plain_table_index.h"
#include "table/filter_block.h" #include "table/filter_block.h"
#include "table/format.h" #include "table/format.h"
#include "table/meta_blocks.h" #include "table/meta_blocks.h"
@ -54,20 +57,36 @@ Status WriteBlock(
extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull; extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull;
extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull;
PlainTableBuilder::PlainTableBuilder(const Options& options, WritableFile* file, PlainTableBuilder::PlainTableBuilder(
uint32_t user_key_len, const Options& options, WritableFile* file, uint32_t user_key_len,
EncodingType encoding_type, EncodingType encoding_type, size_t index_sparseness,
size_t index_sparseness) uint32_t bloom_bits_per_key, uint32_t num_probes, size_t huge_page_tlb_size,
double hash_table_ratio, bool store_index_in_file)
: options_(options), : options_(options),
bloom_block_(num_probes),
file_(file), file_(file),
bloom_bits_per_key_(bloom_bits_per_key),
huge_page_tlb_size_(huge_page_tlb_size),
encoder_(encoding_type, user_key_len, options.prefix_extractor.get(), encoder_(encoding_type, user_key_len, options.prefix_extractor.get(),
index_sparseness) { index_sparseness),
store_index_in_file_(store_index_in_file),
prefix_extractor_(options.prefix_extractor.get()) {
// Build index block and save it in the file if hash_table_ratio > 0
if (store_index_in_file_) {
assert(hash_table_ratio > 0 || IsTotalOrderMode());
index_builder_.reset(
new PlainTableIndexBuilder(&arena_, options, index_sparseness,
hash_table_ratio, huge_page_tlb_size_));
assert(bloom_bits_per_key_ > 0);
properties_.user_collected_properties
[PlainTablePropertyNames::kBloomVersion] = "1"; // For future use
}
properties_.fixed_key_len = user_key_len; properties_.fixed_key_len = user_key_len;
// for plain table, we put all the data in a big chuck. // for plain table, we put all the data in a big chuck.
properties_.num_data_blocks = 1; properties_.num_data_blocks = 1;
// emphasize that currently plain table doesn't have persistent index or // Fill it later if store_index_in_file_ == true
// filter block.
properties_.index_size = 0; properties_.index_size = 0;
properties_.filter_size = 0; properties_.filter_size = 0;
// To support roll-back to previous version, now still use version 0 for // To support roll-back to previous version, now still use version 0 for
@ -100,9 +119,28 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) {
char meta_bytes_buf[6]; char meta_bytes_buf[6];
size_t meta_bytes_buf_size = 0; size_t meta_bytes_buf_size = 0;
ParsedInternalKey internal_key;
ParseInternalKey(key, &internal_key);
// Store key hash
if (store_index_in_file_) {
if (options_.prefix_extractor.get() == nullptr) {
keys_or_prefixes_hashes_.push_back(GetSliceHash(internal_key.user_key));
} else {
Slice prefix =
options_.prefix_extractor->Transform(internal_key.user_key);
keys_or_prefixes_hashes_.push_back(GetSliceHash(prefix));
}
}
// Write value
auto prev_offset = offset_;
// Write out the key // Write out the key
encoder_.AppendKey(key, file_, &offset_, meta_bytes_buf, encoder_.AppendKey(key, file_, &offset_, meta_bytes_buf,
&meta_bytes_buf_size); &meta_bytes_buf_size);
if (SaveIndexInFile()) {
index_builder_->AddKeyPrefix(GetPrefix(internal_key), prev_offset);
}
// Write value length // Write value length
int value_size = value.size(); int value_size = value.size();
@ -133,12 +171,51 @@ Status PlainTableBuilder::Finish() {
properties_.data_size = offset_; properties_.data_size = offset_;
// Write the following blocks // Write the following blocks
// 1. [meta block: properties] // 1. [meta block: bloom] - optional
// 2. [metaindex block] // 2. [meta block: index] - optional
// 3. [footer] // 3. [meta block: properties]
// 4. [metaindex block]
// 5. [footer]
MetaIndexBuilder meta_index_builer; MetaIndexBuilder meta_index_builer;
if (store_index_in_file_ && (properties_.num_entries > 0)) {
bloom_block_.SetTotalBits(
&arena_, properties_.num_entries * bloom_bits_per_key_,
options_.bloom_locality, huge_page_tlb_size_, options_.info_log.get());
PutVarint32(&properties_.user_collected_properties
[PlainTablePropertyNames::kNumBloomBlocks],
bloom_block_.GetNumBlocks());
bloom_block_.AddKeysHashes(keys_or_prefixes_hashes_);
BlockHandle bloom_block_handle;
auto finish_result = bloom_block_.Finish();
properties_.filter_size = finish_result.size();
auto s = WriteBlock(finish_result, file_, &offset_, &bloom_block_handle);
if (!s.ok()) {
return s;
}
BlockHandle index_block_handle;
finish_result = index_builder_->Finish();
properties_.index_size = finish_result.size();
s = WriteBlock(finish_result, file_, &offset_, &index_block_handle);
if (!s.ok()) {
return s;
}
meta_index_builer.Add(BloomBlockBuilder::kBloomBlock, bloom_block_handle);
meta_index_builer.Add(PlainTableIndexBuilder::kPlainTableIndexBlock,
index_block_handle);
}
// Calculate bloom block size and index block size
PropertyBlockBuilder property_block_builder; PropertyBlockBuilder property_block_builder;
// -- Add basic properties // -- Add basic properties
property_block_builder.AddTableProperty(properties_); property_block_builder.AddTableProperty(properties_);

@ -13,6 +13,8 @@
#include "table/plain_table_key_coding.h" #include "table/plain_table_key_coding.h"
#include "rocksdb/table.h" #include "rocksdb/table.h"
#include "rocksdb/table_properties.h" #include "rocksdb/table_properties.h"
#include "table/bloom_block.h"
#include "table/plain_table_index.h"
namespace rocksdb { namespace rocksdb {
@ -30,7 +32,10 @@ class PlainTableBuilder: public TableBuilder {
// that the caller does not know which level the output file will reside. // that the caller does not know which level the output file will reside.
PlainTableBuilder(const Options& options, WritableFile* file, PlainTableBuilder(const Options& options, WritableFile* file,
uint32_t user_key_size, EncodingType encoding_type, uint32_t user_key_size, EncodingType encoding_type,
size_t index_sparseness); size_t index_sparseness, uint32_t bloom_bits_per_key,
uint32_t num_probes = 6, size_t huge_page_tlb_size = 0,
double hash_table_ratio = 0,
bool store_index_in_file = false);
// REQUIRES: Either Finish() or Abandon() has been called. // REQUIRES: Either Finish() or Abandon() has been called.
~PlainTableBuilder(); ~PlainTableBuilder();
@ -62,18 +67,59 @@ class PlainTableBuilder: public TableBuilder {
// Finish() call, returns the size of the final generated file. // Finish() call, returns the size of the final generated file.
uint64_t FileSize() const override; uint64_t FileSize() const override;
bool SaveIndexInFile() const { return store_index_in_file_; }
private: private:
Arena arena_;
Options options_; Options options_;
std::vector<std::unique_ptr<TablePropertiesCollector>> std::vector<std::unique_ptr<TablePropertiesCollector>>
table_properties_collectors_; table_properties_collectors_;
BloomBlockBuilder bloom_block_;
std::unique_ptr<PlainTableIndexBuilder> index_builder_;
WritableFile* file_; WritableFile* file_;
uint64_t offset_ = 0; uint64_t offset_ = 0;
uint32_t bloom_bits_per_key_;
uint32_t huge_page_tlb_size_;
Status status_; Status status_;
TableProperties properties_; TableProperties properties_;
PlainTableKeyEncoder encoder_; PlainTableKeyEncoder encoder_;
bool store_index_in_file_;
std::vector<uint32_t> keys_or_prefixes_hashes_;
bool closed_ = false; // Either Finish() or Abandon() has been called. bool closed_ = false; // Either Finish() or Abandon() has been called.
const SliceTransform* prefix_extractor_;
Slice GetPrefix(const Slice& target) const {
assert(target.size() >= 8); // target is internal key
return GetPrefixFromUserKey(GetUserKey(target));
}
Slice GetPrefix(const ParsedInternalKey& target) const {
return GetPrefixFromUserKey(target.user_key);
}
Slice GetUserKey(const Slice& key) const {
return Slice(key.data(), key.size() - 8);
}
Slice GetPrefixFromUserKey(const Slice& user_key) const {
if (!IsTotalOrderMode()) {
return prefix_extractor_->Transform(user_key);
} else {
// Use empty slice as prefix if prefix_extractor is not set.
// In that case,
// it falls back to pure binary search and
// total iterator seek is supported.
return Slice();
}
}
bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
// No copying allowed // No copying allowed
PlainTableBuilder(const PlainTableBuilder&) = delete; PlainTableBuilder(const PlainTableBuilder&) = delete;
void operator=(const PlainTableBuilder&) = delete; void operator=(const PlainTableBuilder&) = delete;

@ -30,7 +30,9 @@ TableBuilder* PlainTableFactory::NewTableBuilder(
const Options& options, const InternalKeyComparator& internal_comparator, const Options& options, const InternalKeyComparator& internal_comparator,
WritableFile* file, CompressionType compression_type) const { WritableFile* file, CompressionType compression_type) const {
return new PlainTableBuilder(options, file, user_key_len_, encoding_type_, return new PlainTableBuilder(options, file, user_key_len_, encoding_type_,
index_sparseness_); index_sparseness_, bloom_bits_per_key_, 6,
huge_page_tlb_size_, hash_table_ratio_,
store_index_in_file_);
} }
extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options) { extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options) {
@ -43,5 +45,11 @@ const std::string PlainTablePropertyNames::kPrefixExtractorName =
const std::string PlainTablePropertyNames::kEncodingType = const std::string PlainTablePropertyNames::kEncodingType =
"rocksdb.plain.table.encoding.type"; "rocksdb.plain.table.encoding.type";
const std::string PlainTablePropertyNames::kBloomVersion =
"rocksdb.plain.table.bloom.version";
const std::string PlainTablePropertyNames::kNumBloomBlocks =
"rocksdb.plain.table.bloom.numblocks";
} // namespace rocksdb } // namespace rocksdb
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE

@ -151,7 +151,8 @@ class PlainTableFactory : public TableFactory {
index_sparseness_(options.index_sparseness), index_sparseness_(options.index_sparseness),
huge_page_tlb_size_(options.huge_page_tlb_size), huge_page_tlb_size_(options.huge_page_tlb_size),
encoding_type_(options.encoding_type), encoding_type_(options.encoding_type),
full_scan_mode_(options.full_scan_mode) {} full_scan_mode_(options.full_scan_mode),
store_index_in_file_(options.store_index_in_file) {}
const char* Name() const override { return "PlainTable"; } const char* Name() const override { return "PlainTable"; }
Status NewTableReader(const Options& options, const EnvOptions& soptions, Status NewTableReader(const Options& options, const EnvOptions& soptions,
const InternalKeyComparator& internal_comparator, const InternalKeyComparator& internal_comparator,
@ -173,6 +174,7 @@ class PlainTableFactory : public TableFactory {
size_t huge_page_tlb_size_; size_t huge_page_tlb_size_;
EncodingType encoding_type_; EncodingType encoding_type_;
bool full_scan_mode_; bool full_scan_mode_;
bool store_index_in_file_;
}; };
} // namespace rocksdb } // namespace rocksdb

@ -0,0 +1,196 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include "table/plain_table_index.h"
#include "util/coding.h"
#include "util/hash.h"
namespace rocksdb {
namespace {
inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
assert(num_buckets > 0);
return hash % num_buckets;
}
}
void PlainTableIndex::InitFromRawData(Slice data) {
assert(GetVarint32(&data, &index_size_));
assert(index_size_ > 0);
assert(GetVarint32(&data, &num_prefixes_));
sub_index_size_ = data.size() - index_size_ * kOffsetLen;
char* index_data_begin = const_cast<char*>(data.data());
index_ = reinterpret_cast<uint32_t*>(index_data_begin);
sub_index_ = reinterpret_cast<char*>(index_ + index_size_);
}
PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset(
uint32_t prefix_hash, uint32_t* bucket_value) const {
int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
*bucket_value = index_[bucket];
if ((*bucket_value & kSubIndexMask) == kSubIndexMask) {
*bucket_value ^= kSubIndexMask;
return kSubindex;
}
if (*bucket_value >= kMaxFileSize) {
return kNoPrefixForBucket;
} else {
// point directly to the file
return kDirectToFile;
}
}
void PlainTableIndexBuilder::IndexRecordList::AddRecord(murmur_t hash,
uint32_t offset) {
if (num_records_in_current_group_ == kNumRecordsPerGroup) {
current_group_ = AllocateNewGroup();
num_records_in_current_group_ = 0;
}
auto& new_record = current_group_[num_records_in_current_group_++];
new_record.hash = hash;
new_record.offset = offset;
new_record.next = nullptr;
}
void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice,
uint64_t key_offset) {
if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) {
++num_prefixes_;
if (!is_first_record_) {
keys_per_prefix_hist_.Add(num_keys_per_prefix_);
}
num_keys_per_prefix_ = 0;
prev_key_prefix_ = key_prefix_slice.ToString();
prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice);
due_index_ = true;
}
if (due_index_) {
// Add an index key for every kIndexIntervalForSamePrefixKeys keys
record_list_.AddRecord(prev_key_prefix_hash_, key_offset);
due_index_ = false;
}
num_keys_per_prefix_++;
if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) {
due_index_ = true;
}
is_first_record_ = false;
}
Slice PlainTableIndexBuilder::Finish() {
AllocateIndex();
std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
std::vector<uint32_t> entries_per_bucket(index_size_, 0);
BucketizeIndexes(&hash_to_offsets, &entries_per_bucket);
keys_per_prefix_hist_.Add(num_keys_per_prefix_);
Log(options_.info_log, "Number of Keys per prefix Histogram: %s",
keys_per_prefix_hist_.ToString().c_str());
// From the temp data structure, populate indexes.
return FillIndexes(hash_to_offsets, entries_per_bucket);
}
void PlainTableIndexBuilder::AllocateIndex() {
if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) {
// Fall back to pure binary search if the user fails to specify a prefix
// extractor.
index_size_ = 1;
} else {
double hash_table_size_multipier = 1.0 / hash_table_ratio_;
index_size_ = num_prefixes_ * hash_table_size_multipier + 1;
assert(index_size_ > 0);
}
}
void PlainTableIndexBuilder::BucketizeIndexes(
std::vector<IndexRecord*>* hash_to_offsets,
std::vector<uint32_t>* entries_per_bucket) {
bool first = true;
uint32_t prev_hash = 0;
size_t num_records = record_list_.GetNumRecords();
for (size_t i = 0; i < num_records; i++) {
IndexRecord* index_record = record_list_.At(i);
uint32_t cur_hash = index_record->hash;
if (first || prev_hash != cur_hash) {
prev_hash = cur_hash;
first = false;
}
uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
index_record->next = prev_bucket_head;
(*hash_to_offsets)[bucket] = index_record;
(*entries_per_bucket)[bucket]++;
}
sub_index_size_ = 0;
for (auto entry_count : *entries_per_bucket) {
if (entry_count <= 1) {
continue;
}
// Only buckets with more than 1 entry will have subindex.
sub_index_size_ += VarintLength(entry_count);
// total bytes needed to store these entries' in-file offsets.
sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen;
}
}
Slice PlainTableIndexBuilder::FillIndexes(
const std::vector<IndexRecord*>& hash_to_offsets,
const std::vector<uint32_t>& entries_per_bucket) {
Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index",
sub_index_size_);
auto total_allocate_size = GetTotalSize();
char* allocated = arena_->AllocateAligned(
total_allocate_size, huge_page_tlb_size_, options_.info_log.get());
auto temp_ptr = EncodeVarint32(allocated, index_size_);
uint32_t* index =
reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_));
char* sub_index = reinterpret_cast<char*>(index + index_size_);
size_t sub_index_offset = 0;
for (uint32_t i = 0; i < index_size_; i++) {
uint32_t num_keys_for_bucket = entries_per_bucket[i];
switch (num_keys_for_bucket) {
case 0:
// No key for bucket
index[i] = PlainTableIndex::kMaxFileSize;
break;
case 1:
// point directly to the file offset
index[i] = hash_to_offsets[i]->offset;
break;
default:
// point to second level indexes.
index[i] = sub_index_offset | PlainTableIndex::kSubIndexMask;
char* prev_ptr = &sub_index[sub_index_offset];
char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
sub_index_offset += (cur_ptr - prev_ptr);
char* sub_index_pos = &sub_index[sub_index_offset];
IndexRecord* record = hash_to_offsets[i];
int j;
for (j = num_keys_for_bucket - 1; j >= 0 && record;
j--, record = record->next) {
EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
}
assert(j == -1 && record == nullptr);
sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket;
assert(sub_index_offset <= sub_index_size_);
break;
}
}
assert(sub_index_offset == sub_index_size_);
Log(options_.info_log, "hash table size: %d, suffix_map length %zu",
index_size_, sub_index_size_);
return Slice(allocated, GetTotalSize());
}
const std::string PlainTableIndexBuilder::kPlainTableIndexBlock =
"PlainTableIndexBlock";
}; // namespace rocksdb

@ -0,0 +1,221 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#include <string>
#include <vector>
#include "db/dbformat.h"
#include "rocksdb/options.h"
#include "util/murmurhash.h"
#include "util/hash.h"
#include "util/arena.h"
#include "util/histogram.h"
namespace rocksdb {
// PlainTableIndex contains buckets size of index_size_, each is a
// 32-bit integer. The lower 31 bits contain an offset value (explained below)
// and the first bit of the integer indicates type of the offset.
//
// +--------------+------------------------------------------------------+
// | Flag (1 bit) | Offset to binary search buffer or file (31 bits) +
// +--------------+------------------------------------------------------+
//
// Explanation for the "flag bit":
//
// 0 indicates that the bucket contains only one prefix (no conflict when
// hashing this prefix), whose first row starts from this offset of the
// file.
// 1 indicates that the bucket contains more than one prefixes, or there
// are too many rows for one prefix so we need a binary search for it. In
// this case, the offset indicates the offset of sub_index_ holding the
// binary search indexes of keys for those rows. Those binary search indexes
// are organized in this way:
//
// The first 4 bytes, indicate how many indexes (N) are stored after it. After
// it, there are N 32-bit integers, each points of an offset of the file,
// which
// points to starting of a row. Those offsets need to be guaranteed to be in
// ascending order so the keys they are pointing to are also in ascending
// order
// to make sure we can use them to do binary searches. Below is visual
// presentation of a bucket.
//
// <begin>
// number_of_records: varint32
// record 1 file offset: fixedint32
// record 2 file offset: fixedint32
// ....
// record N file offset: fixedint32
// <end>
class PlainTableIndex {
public:
enum IndexSearchResult {
kNoPrefixForBucket = 0,
kDirectToFile = 1,
kSubindex = 2
};
explicit PlainTableIndex(Slice data) { InitFromRawData(data); }
PlainTableIndex()
: index_size_(0),
sub_index_size_(0),
num_prefixes_(0),
index_(nullptr),
sub_index_(nullptr) {}
IndexSearchResult GetOffset(uint32_t prefix_hash,
uint32_t* bucket_value) const;
void InitFromRawData(Slice data);
const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset,
uint32_t* upper_bound) const {
const char* index_ptr = &sub_index_[offset];
return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound);
}
uint32_t GetIndexSize() const { return index_size_; }
uint32_t GetSubIndexSize() const { return sub_index_size_; }
uint32_t GetNumPrefixes() const { return num_prefixes_; }
static const uint64_t kMaxFileSize = (1u << 31) - 1;
static const uint32_t kSubIndexMask = 0x80000000;
static const size_t kOffsetLen = sizeof(uint32_t);
private:
uint32_t index_size_;
size_t sub_index_size_;
uint32_t num_prefixes_;
uint32_t* index_;
char* sub_index_;
};
// PlainTableIndexBuilder is used to create plain table index.
// After calling Finish(), it returns Slice, which is usually
// used either to initialize PlainTableIndex or
// to save index to sst file.
// For more details about the index, please refer to:
// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
// #wiki-in-memory-index-format
class PlainTableIndexBuilder {
public:
PlainTableIndexBuilder(Arena* arena, const Options& options,
uint32_t index_sparseness, double hash_table_ratio,
double huge_page_tlb_size)
: arena_(arena),
options_(options),
record_list_(kRecordsPerGroup),
is_first_record_(true),
due_index_(false),
num_prefixes_(0),
num_keys_per_prefix_(0),
prev_key_prefix_hash_(0),
index_sparseness_(index_sparseness),
prefix_extractor_(options.prefix_extractor.get()),
hash_table_ratio_(hash_table_ratio),
huge_page_tlb_size_(huge_page_tlb_size) {}
void AddKeyPrefix(Slice key_prefix_slice, uint64_t key_offset);
Slice Finish();
uint32_t GetTotalSize() const {
return VarintLength(index_size_) + VarintLength(num_prefixes_) +
PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_;
}
static const std::string kPlainTableIndexBlock;
private:
struct IndexRecord {
uint32_t hash; // hash of the prefix
uint32_t offset; // offset of a row
IndexRecord* next;
};
// Helper class to track all the index records
class IndexRecordList {
public:
explicit IndexRecordList(size_t num_records_per_group)
: kNumRecordsPerGroup(num_records_per_group),
current_group_(nullptr),
num_records_in_current_group_(num_records_per_group) {}
~IndexRecordList() {
for (size_t i = 0; i < groups_.size(); i++) {
delete[] groups_[i];
}
}
void AddRecord(murmur_t hash, uint32_t offset);
size_t GetNumRecords() const {
return (groups_.size() - 1) * kNumRecordsPerGroup +
num_records_in_current_group_;
}
IndexRecord* At(size_t index) {
return &(groups_[index / kNumRecordsPerGroup]
[index % kNumRecordsPerGroup]);
}
private:
IndexRecord* AllocateNewGroup() {
IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
groups_.push_back(result);
return result;
}
// Each group in `groups_` contains fix-sized records (determined by
// kNumRecordsPerGroup). Which can help us minimize the cost if resizing
// occurs.
const size_t kNumRecordsPerGroup;
IndexRecord* current_group_;
// List of arrays allocated
std::vector<IndexRecord*> groups_;
size_t num_records_in_current_group_;
};
void AllocateIndex();
// Internal helper function to bucket index record list to hash buckets.
void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets,
std::vector<uint32_t>* entries_per_bucket);
// Internal helper class to fill the indexes and bloom filters to internal
// data structures.
Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets,
const std::vector<uint32_t>& entries_per_bucket);
Arena* arena_;
Options options_;
HistogramImpl keys_per_prefix_hist_;
IndexRecordList record_list_;
bool is_first_record_;
bool due_index_;
uint32_t num_prefixes_;
uint32_t num_keys_per_prefix_;
uint32_t prev_key_prefix_hash_;
uint32_t index_sparseness_;
uint32_t index_size_;
size_t sub_index_size_;
const SliceTransform* prefix_extractor_;
double hash_table_ratio_;
double huge_page_tlb_size_;
std::string prev_key_prefix_;
static const size_t kRecordsPerGroup = 256;
};
}; // namespace rocksdb
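
To make the bucket encoding documented at the top of this header concrete, a standalone sketch; the constants mirror PlainTableIndex, and the decode function is illustrative rather than RocksDB code:

#include <cstdint>
#include <cstdio>

namespace {

const uint32_t kSubIndexMask = 0x80000000;     // high bit: bucket flag
const uint32_t kMaxFileSize = (1u << 31) - 1;  // also the empty-bucket value

// Interpret one 32-bit bucket entry the way PlainTableIndex::GetOffset does.
void DecodeBucket(uint32_t bucket_value) {
  if ((bucket_value & kSubIndexMask) == kSubIndexMask) {
    std::printf("sub-index at offset %u (binary search over row offsets)\n",
                static_cast<unsigned>(bucket_value ^ kSubIndexMask));
  } else if (bucket_value >= kMaxFileSize) {
    std::printf("empty bucket (no prefix hashed here)\n");
  } else {
    std::printf("single prefix, first row at file offset %u\n",
                static_cast<unsigned>(bucket_value));
  }
}

}  // namespace

int main() {
  DecodeBucket(1234);                // direct-to-file case
  DecodeBucket(kSubIndexMask | 64);  // sub-index case
  DecodeBucket(kMaxFileSize);        // empty bucket
  return 0;
}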

@ -3,6 +3,7 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
#include "table/plain_table_reader.h" #include "table/plain_table_reader.h"
#include <string> #include <string>
@ -18,6 +19,7 @@
#include "rocksdb/statistics.h" #include "rocksdb/statistics.h"
#include "table/block.h" #include "table/block.h"
#include "table/bloom_block.h"
#include "table/filter_block.h" #include "table/filter_block.h"
#include "table/format.h" #include "table/format.h"
#include "table/meta_blocks.h" #include "table/meta_blocks.h"
@ -39,15 +41,6 @@ namespace rocksdb {
namespace { namespace {
inline uint32_t GetSliceHash(const Slice& s) {
return Hash(s.data(), s.size(), 397) ;
}
inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
assert(num_buckets >= 0);
return hash % num_buckets;
}
// Safely getting a uint32_t element from a char array, where, starting from // Safely getting a uint32_t element from a char array, where, starting from
// `base`, every 4 bytes are considered as an fixed 32 bit integer. // `base`, every 4 bytes are considered as an fixed 32 bit integer.
inline uint32_t GetFixed32Element(const char* base, size_t offset) { inline uint32_t GetFixed32Element(const char* base, size_t offset) {
@ -103,6 +96,7 @@ PlainTableReader::PlainTableReader(const Options& options,
const TableProperties* table_properties) const TableProperties* table_properties)
: internal_comparator_(icomparator), : internal_comparator_(icomparator),
encoding_type_(encoding_type), encoding_type_(encoding_type),
full_scan_mode_(false),
data_end_offset_(table_properties->data_size), data_end_offset_(table_properties->data_size),
user_key_len_(table_properties->fixed_key_len), user_key_len_(table_properties->fixed_key_len),
prefix_extractor_(options.prefix_extractor.get()), prefix_extractor_(options.prefix_extractor.get()),
@ -126,8 +120,7 @@ Status PlainTableReader::Open(const Options& options,
double hash_table_ratio, size_t index_sparseness, double hash_table_ratio, size_t index_sparseness,
size_t huge_page_tlb_size, bool full_scan_mode) { size_t huge_page_tlb_size, bool full_scan_mode) {
assert(options.allow_mmap_reads); assert(options.allow_mmap_reads);
if (file_size > PlainTableIndex::kMaxFileSize) {
if (file_size > kMaxFileSize) {
return Status::NotSupported("File is too large for PlainTableReader!"); return Status::NotSupported("File is too large for PlainTableReader!");
} }
@ -173,7 +166,6 @@ Status PlainTableReader::Open(const Options& options,
return s; return s;
} }
// -- Populate Index
if (!full_scan_mode) { if (!full_scan_mode) {
s = new_reader->PopulateIndex(props, bloom_bits_per_key, hash_table_ratio, s = new_reader->PopulateIndex(props, bloom_bits_per_key, hash_table_ratio,
index_sparseness, huge_page_tlb_size); index_sparseness, huge_page_tlb_size);
@ -183,7 +175,7 @@ Status PlainTableReader::Open(const Options& options,
} else { } else {
// Flag to indicate it is a full scan mode so that none of the indexes // Flag to indicate it is a full scan mode so that none of the indexes
// can be used. // can be used.
new_reader->index_size_ = kFullScanModeFlag; new_reader->full_scan_mode_ = true;
} }
*table_reader = std::move(new_reader); *table_reader = std::move(new_reader);
@ -203,79 +195,15 @@ Iterator* PlainTableReader::NewIterator(const ReadOptions& options,
} }
} }
struct PlainTableReader::IndexRecord { Status PlainTableReader::PopulateIndexRecordList(
uint32_t hash; // hash of the prefix PlainTableIndexBuilder* index_builder, vector<uint32_t>* prefix_hashes) {
uint32_t offset; // offset of a row
IndexRecord* next;
};
// Helper class to track all the index records
class PlainTableReader::IndexRecordList {
public:
explicit IndexRecordList(size_t num_records_per_group)
: kNumRecordsPerGroup(num_records_per_group),
current_group_(nullptr),
num_records_in_current_group_(num_records_per_group) {}
~IndexRecordList() {
for (size_t i = 0; i < groups_.size(); i++) {
delete[] groups_[i];
}
}
void AddRecord(murmur_t hash, uint32_t offset) {
if (num_records_in_current_group_ == kNumRecordsPerGroup) {
current_group_ = AllocateNewGroup();
num_records_in_current_group_ = 0;
}
auto& new_record = current_group_[num_records_in_current_group_++];
new_record.hash = hash;
new_record.offset = offset;
new_record.next = nullptr;
}
size_t GetNumRecords() const {
return (groups_.size() - 1) * kNumRecordsPerGroup +
num_records_in_current_group_;
}
IndexRecord* At(size_t index) {
return &(groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]);
}
private:
IndexRecord* AllocateNewGroup() {
IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
groups_.push_back(result);
return result;
}
// Each group in `groups_` contains fix-sized records (determined by
// kNumRecordsPerGroup). Which can help us minimize the cost if resizing
// occurs.
const size_t kNumRecordsPerGroup;
IndexRecord* current_group_;
// List of arrays allocated
std::vector<IndexRecord*> groups_;
size_t num_records_in_current_group_;
};
Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list,
int* num_prefixes,
int bloom_bits_per_key,
size_t index_sparseness) {
Slice prev_key_prefix_slice; Slice prev_key_prefix_slice;
uint32_t prev_key_prefix_hash = 0;
uint32_t pos = data_start_offset_; uint32_t pos = data_start_offset_;
int num_keys_per_prefix = 0;
bool is_first_record = true;
HistogramImpl keys_per_prefix_hist;
// Need map to be ordered to make sure sub indexes generated
// are in order.
*num_prefixes = 0; bool is_first_record = true;
Slice key_prefix_slice;
PlainTableKeyDecoder decoder(encoding_type_, user_key_len_, PlainTableKeyDecoder decoder(encoding_type_, user_key_len_,
options_.prefix_extractor.get()); options_.prefix_extractor.get());
bool due_index = false;
while (pos < data_end_offset_) { while (pos < data_end_offset_) {
uint32_t key_offset = pos; uint32_t key_offset = pos;
ParsedInternalKey key; ParsedInternalKey key;
@ -285,152 +213,53 @@ Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list,
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
key_prefix_slice = GetPrefix(key);
if (enable_bloom_) { if (enable_bloom_) {
// total order mode and bloom filter is enabled.
bloom_.AddHash(GetSliceHash(key.user_key)); bloom_.AddHash(GetSliceHash(key.user_key));
} } else {
Slice key_prefix_slice = GetPrefix(key); if (is_first_record || prev_key_prefix_slice != key_prefix_slice) {
if (!is_first_record) {
if (is_first_record || prev_key_prefix_slice != key_prefix_slice) { prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice));
++(*num_prefixes); }
if (!is_first_record) { prev_key_prefix_slice = key_prefix_slice;
keys_per_prefix_hist.Add(num_keys_per_prefix);
} }
num_keys_per_prefix = 0;
prev_key_prefix_slice = key_prefix_slice;
prev_key_prefix_hash = GetSliceHash(key_prefix_slice);
due_index = true;
} }
if (due_index) { index_builder->AddKeyPrefix(GetPrefix(key), key_offset);
if (!seekable) {
return Status::Corruption("Key for a prefix is not seekable");
}
// Add an index key for every kIndexIntervalForSamePrefixKeys keys
record_list->AddRecord(prev_key_prefix_hash, key_offset);
due_index = false;
}
num_keys_per_prefix++; if (!seekable && is_first_record) {
if (index_sparseness == 0 || num_keys_per_prefix % index_sparseness == 0) { return Status::Corruption("Key for a prefix is not seekable");
due_index = true;
} }
is_first_record = false; is_first_record = false;
} }
-    keys_per_prefix_hist.Add(num_keys_per_prefix);
-  Log(options_.info_log, "Number of Keys per prefix Histogram: %s",
-      keys_per_prefix_hist.ToString().c_str());
+    prefix_hashes->push_back(GetSliceHash(key_prefix_slice));
+  index_.InitFromRawData(index_builder->Finish());
 
   return Status::OK();
 }
 
-void PlainTableReader::AllocateIndexAndBloom(int num_prefixes,
-                                             int bloom_bits_per_key,
-                                             double hash_table_ratio,
-                                             size_t huge_page_tlb_size) {
-  if (prefix_extractor_ != nullptr) {
+void PlainTableReader::AllocateAndFillBloom(int bloom_bits_per_key,
+                                            int num_prefixes,
+                                            size_t huge_page_tlb_size,
+                                            vector<uint32_t>* prefix_hashes) {
+  if (!IsTotalOrderMode()) {
     uint32_t bloom_total_bits = num_prefixes * bloom_bits_per_key;
     if (bloom_total_bits > 0) {
       enable_bloom_ = true;
       bloom_.SetTotalBits(&arena_, bloom_total_bits, options_.bloom_locality,
                           huge_page_tlb_size, options_.info_log.get());
+      FillBloom(prefix_hashes);
     }
   }
-
-  if (prefix_extractor_ == nullptr || hash_table_ratio <= 0) {
-    // Fall back to pure binary search if the user fails to specify a prefix
-    // extractor.
-    index_size_ = 1;
-  } else {
-    double hash_table_size_multipier = 1.0 / hash_table_ratio;
-    index_size_ = num_prefixes * hash_table_size_multipier + 1;
-  }
-}
-
-size_t PlainTableReader::BucketizeIndexesAndFillBloom(
-    IndexRecordList* record_list, std::vector<IndexRecord*>* hash_to_offsets,
-    std::vector<uint32_t>* entries_per_bucket) {
-  bool first = true;
-  uint32_t prev_hash = 0;
-  size_t num_records = record_list->GetNumRecords();
-  for (size_t i = 0; i < num_records; i++) {
-    IndexRecord* index_record = record_list->At(i);
-    uint32_t cur_hash = index_record->hash;
-    if (first || prev_hash != cur_hash) {
-      prev_hash = cur_hash;
-      first = false;
-      if (enable_bloom_ && !IsTotalOrderMode()) {
-        bloom_.AddHash(cur_hash);
-      }
-    }
-    uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
-    IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
-    index_record->next = prev_bucket_head;
-    (*hash_to_offsets)[bucket] = index_record;
-    (*entries_per_bucket)[bucket]++;
-  }
-  size_t sub_index_size = 0;
-  for (auto entry_count : *entries_per_bucket) {
-    if (entry_count <= 1) {
-      continue;
-    }
-    // Only buckets with more than 1 entry will have subindex.
-    sub_index_size += VarintLength(entry_count);
-    // total bytes needed to store these entries' in-file offsets.
-    sub_index_size += entry_count * kOffsetLen;
-  }
-  return sub_index_size;
 }
 
-void PlainTableReader::FillIndexes(
-    const size_t kSubIndexSize,
-    const std::vector<IndexRecord*>& hash_to_offsets,
-    const std::vector<uint32_t>& entries_per_bucket,
-    size_t huge_page_tlb_size) {
-  Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index",
-      kSubIndexSize);
-  auto total_allocate_size = sizeof(uint32_t) * index_size_ + kSubIndexSize;
-  char* allocated = arena_.AllocateAligned(
-      total_allocate_size, huge_page_tlb_size, options_.info_log.get());
-  index_ = reinterpret_cast<uint32_t*>(allocated);
-  sub_index_ = allocated + sizeof(uint32_t) * index_size_;
-
-  size_t sub_index_offset = 0;
-  for (int i = 0; i < index_size_; i++) {
-    uint32_t num_keys_for_bucket = entries_per_bucket[i];
-    switch (num_keys_for_bucket) {
-      case 0:
-        // No key for bucket
-        index_[i] = data_end_offset_;
-        break;
-      case 1:
-        // point directly to the file offset
-        index_[i] = hash_to_offsets[i]->offset;
-        break;
-      default:
-        // point to second level indexes.
-        index_[i] = sub_index_offset | kSubIndexMask;
-        char* prev_ptr = &sub_index_[sub_index_offset];
-        char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
-        sub_index_offset += (cur_ptr - prev_ptr);
-        char* sub_index_pos = &sub_index_[sub_index_offset];
-        IndexRecord* record = hash_to_offsets[i];
-        int j;
-        for (j = num_keys_for_bucket - 1; j >= 0 && record;
-             j--, record = record->next) {
-          EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
-        }
-        assert(j == -1 && record == nullptr);
-        sub_index_offset += kOffsetLen * num_keys_for_bucket;
-        assert(sub_index_offset <= kSubIndexSize);
-        break;
-    }
-  }
-  assert(sub_index_offset == kSubIndexSize);
-  Log(options_.info_log, "hash table size: %d, suffix_map length %zu",
-      index_size_, kSubIndexSize);
+void PlainTableReader::FillBloom(vector<uint32_t>* prefix_hashes) {
+  assert(bloom_.IsInitialized());
+  for (auto prefix_hash : *prefix_hashes) {
+    bloom_.AddHash(prefix_hash);
+  }
 }
 
 Status PlainTableReader::MmapDataFile() {
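Note on the new flow above: the reader now collects one 32-bit hash per prefix while scanning the data section, AllocateAndFillBloom() sizes the filter as num_prefixes * bloom_bits_per_key, and FillBloom() replays the collected hashes. A minimal standalone sketch of that collect-then-fill idea; the single-probe bit vector and ToyHash below are illustrative stand-ins, not DynamicBloom or GetSliceHash:

    // Toy "collect prefix hashes, then size and fill the bloom filter".
    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <vector>

    uint32_t ToyHash(const std::string& s) {  // stand-in for GetSliceHash()
      uint32_t h = 397;
      for (unsigned char c : s) h = h * 16777619u ^ c;
      return h;
    }

    int main() {
      std::vector<std::string> prefixes = {"00000000", "00000001", "00000002"};
      std::vector<uint32_t> prefix_hashes;
      for (const auto& p : prefixes) {
        prefix_hashes.push_back(ToyHash(p));  // collected while scanning rows
      }

      const uint32_t bloom_bits_per_key = 10;
      uint32_t total_bits =
          static_cast<uint32_t>(prefix_hashes.size()) * bloom_bits_per_key;
      std::vector<bool> bits(total_bits, false);
      for (uint32_t h : prefix_hashes) {
        bits[h % total_bits] = true;  // FillBloom() analogue
      }

      uint32_t probe = ToyHash("00000001");
      std::cout << "may contain: " << bits[probe % total_bits] << "\n";
      return 0;
    }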
@@ -445,59 +274,111 @@ Status PlainTableReader::PopulateIndex(TableProperties* props,
                                        size_t huge_page_tlb_size) {
   assert(props != nullptr);
   table_properties_.reset(props);
 
-  // options.prefix_extractor is requried for a hash-based look-up.
-  if ((options_.prefix_extractor.get() == nullptr) && (hash_table_ratio != 0)) {
+  BlockContents bloom_block_contents;
+  auto s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber,
+                         options_.env, BloomBlockBuilder::kBloomBlock,
+                         &bloom_block_contents);
+  bool index_in_file = s.ok();
+
+  BlockContents index_block_contents;
+  s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber,
+                    options_.env, PlainTableIndexBuilder::kPlainTableIndexBlock,
+                    &index_block_contents);
+  index_in_file &= s.ok();
+
+  Slice* bloom_block;
+  if (index_in_file) {
+    bloom_block = &bloom_block_contents.data;
+  } else {
+    bloom_block = nullptr;
+  }
+
+  // index_in_file == true only if there are kBloomBlock and
+  // kPlainTableIndexBlock
+  // in file
+  Slice* index_block;
+  if (index_in_file) {
+    index_block = &index_block_contents.data;
+  } else {
+    index_block = nullptr;
+  }
+
+  if ((options_.prefix_extractor.get() == nullptr) && (hash_table_ratio != 0)) {
     // options.prefix_extractor is requried for a hash-based look-up.
     return Status::NotSupported(
         "PlainTable requires a prefix extractor enable prefix hash mode.");
   }
 
-  IndexRecordList record_list(kRecordsPerGroup);
   // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows
   // for a prefix (starting from the first one), generate a record of (hash,
   // offset) and append it to IndexRecordList, which is a data structure created
   // to store them.
-  int num_prefixes;
 
-  // Allocate bloom filter here for total order mode.
-  if (IsTotalOrderMode()) {
-    uint32_t num_bloom_bits =
-        table_properties_->num_entries * bloom_bits_per_key;
-    if (num_bloom_bits > 0) {
-      enable_bloom_ = true;
-      bloom_.SetTotalBits(&arena_, num_bloom_bits, options_.bloom_locality,
-                          huge_page_tlb_size, options_.info_log.get());
-    }
-  }
+  if (!index_in_file) {
+    // Allocate bloom filter here for total order mode.
+    if (IsTotalOrderMode()) {
+      uint32_t num_bloom_bits =
+          table_properties_->num_entries * bloom_bits_per_key;
+      if (num_bloom_bits > 0) {
+        enable_bloom_ = true;
+        bloom_.SetTotalBits(&arena_, num_bloom_bits, options_.bloom_locality,
+                            huge_page_tlb_size, options_.info_log.get());
+      }
+    }
+  } else {
+    enable_bloom_ = true;
+    auto num_blocks_property = props->user_collected_properties.find(
+        PlainTablePropertyNames::kNumBloomBlocks);
+    uint32_t num_blocks = 0;
+    if (num_blocks_property != props->user_collected_properties.end()) {
+      Slice temp_slice(num_blocks_property->second);
+      if (!GetVarint32(&temp_slice, &num_blocks)) {
+        num_blocks = 0;
+      }
+    }
+    // cast away const qualifier, because bloom_ won't be changed
+    bloom_.SetRawData(
+        const_cast<unsigned char*>(
+            reinterpret_cast<const unsigned char*>(bloom_block->data())),
+        bloom_block->size() * 8, num_blocks);
+  }
 
-  Status s = PopulateIndexRecordList(&record_list, &num_prefixes,
-                                     bloom_bits_per_key, index_sparseness);
-  if (!s.ok()) {
-    return s;
-  }
-  // Calculated hash table and bloom filter size and allocate memory for indexes
-  // and bloom filter based on the number of prefixes.
-  AllocateIndexAndBloom(num_prefixes, bloom_bits_per_key, hash_table_ratio,
-                        huge_page_tlb_size);
-  // Bucketize all the index records to a temp data structure, in which for
-  // each bucket, we generate a linked list of IndexRecord, in reversed order.
-  std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
-  std::vector<uint32_t> entries_per_bucket(index_size_, 0);
-  size_t sub_index_size_needed = BucketizeIndexesAndFillBloom(
-      &record_list, &hash_to_offsets, &entries_per_bucket);
-  // From the temp data structure, populate indexes.
-  FillIndexes(sub_index_size_needed, hash_to_offsets, entries_per_bucket,
-              huge_page_tlb_size);
+  PlainTableIndexBuilder index_builder(&arena_, options_, index_sparseness,
+                                       hash_table_ratio, huge_page_tlb_size);
+
+  std::vector<uint32_t> prefix_hashes;
+  if (!index_in_file) {
+    Status s = PopulateIndexRecordList(&index_builder, &prefix_hashes);
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    index_.InitFromRawData(*index_block);
+  }
+
+  if (!index_in_file) {
+    // Calculated bloom filter size and allocate memory for
+    // bloom filter based on the number of prefixes, then fill it.
+    AllocateAndFillBloom(bloom_bits_per_key, index_.GetNumPrefixes(),
+                         huge_page_tlb_size, &prefix_hashes);
+  }
 
   // Fill two table properties.
-  // TODO(sdong): after we have the feature of storing index in file, this
-  // properties need to be populated to index_size instead.
-  props->user_collected_properties["plain_table_hash_table_size"] =
-      std::to_string(index_size_ * 4U);
-  props->user_collected_properties["plain_table_sub_index_size"] =
-      std::to_string(sub_index_size_needed);
+  if (!index_in_file) {
+    props->user_collected_properties["plain_table_hash_table_size"] =
+        std::to_string(index_.GetIndexSize() * PlainTableIndex::kOffsetLen);
+    props->user_collected_properties["plain_table_sub_index_size"] =
+        std::to_string(index_.GetSubIndexSize());
+  } else {
+    props->user_collected_properties["plain_table_hash_table_size"] =
+        std::to_string(0);
+    props->user_collected_properties["plain_table_sub_index_size"] =
+        std::to_string(0);
+  }
 
   return Status::OK();
 }
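The rule encoded above is all-or-nothing: the stored index is only trusted when both the bloom meta block and the plain-table index meta block can be read; otherwise both structures are rebuilt from the data section. A hedged sketch of that decision, with a plain map and made-up block names standing in for ReadMetaBlock() and the real block keys:

    // "Both meta blocks or rebuild" decision; names are placeholders.
    #include <iostream>
    #include <map>
    #include <string>

    int main() {
      std::map<std::string, std::string> meta_blocks = {
          {"bloom", "<raw bloom bits>"}};  // index block deliberately missing

      bool index_in_file = meta_blocks.count("bloom") > 0;
      index_in_file &= meta_blocks.count("index") > 0;  // both must be present

      std::cout << (index_in_file
                        ? "reuse stored bloom + index\n"
                        : "rebuild bloom + index from data section\n");
      return 0;
    }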
@@ -506,24 +387,21 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix,
                                    uint32_t prefix_hash, bool& prefix_matched,
                                    uint32_t* offset) const {
   prefix_matched = false;
-  int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
-  uint32_t bucket_value = index_[bucket];
-  if (bucket_value == data_end_offset_) {
+  uint32_t prefix_index_offset;
+  auto res = index_.GetOffset(prefix_hash, &prefix_index_offset);
+  if (res == PlainTableIndex::kNoPrefixForBucket) {
     *offset = data_end_offset_;
     return Status::OK();
-  } else if ((bucket_value & kSubIndexMask) == 0) {
-    // point directly to the file
-    *offset = bucket_value;
+  } else if (res == PlainTableIndex::kDirectToFile) {
+    *offset = prefix_index_offset;
     return Status::OK();
   }
 
   // point to sub-index, need to do a binary search
+  uint32_t upper_bound;
+  const char* base_ptr =
+      index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound);
   uint32_t low = 0;
-  uint64_t prefix_index_offset = bucket_value ^ kSubIndexMask;
-  const char* index_ptr = &sub_index_[prefix_index_offset];
-  uint32_t upper_bound = 0;
-  const char* base_ptr = GetVarint32Ptr(index_ptr, index_ptr + 4, &upper_bound);
   uint32_t high = upper_bound;
   ParsedInternalKey mid_key;
   ParsedInternalKey parsed_target;
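GetOffset() now branches on the three outcomes of the in-memory index lookup: no prefix in the bucket, a direct file offset, or a pointer into the sub-index that is binary-searched. The sketch below mirrors that branching; the enum follows the two names shown above plus an assumed third value, and the sub-index is simplified to a sorted offset array (the real code compares the keys stored at those offsets, not the offsets themselves):

    // Acting on the three lookup outcomes; kSubIndex is an assumed name.
    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    enum LookupResult { kNoPrefixForBucket, kDirectToFile, kSubIndex };

    uint32_t Resolve(LookupResult res, uint32_t direct_offset,
                     const std::vector<uint32_t>& sub_index, uint32_t target,
                     uint32_t data_end_offset) {
      switch (res) {
        case kNoPrefixForBucket:
          return data_end_offset;  // prefix does not exist in this file
        case kDirectToFile:
          return direct_offset;    // bucket holds a single prefix
        case kSubIndex: {
          // Binary search over the per-prefix offsets; the real GetOffset()
          // compares the keys stored at these offsets instead.
          auto it =
              std::lower_bound(sub_index.begin(), sub_index.end(), target);
          return it == sub_index.end() ? data_end_offset : *it;
        }
      }
      return data_end_offset;
    }

    int main() {
      std::vector<uint32_t> offsets = {128, 512, 2048};
      std::cout << Resolve(kSubIndex, 0, offsets, 500, 1u << 20) << "\n";  // 512
      return 0;
    }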
@@ -593,9 +471,6 @@ bool PlainTableReader::MatchBloom(uint32_t hash) const {
   return !enable_bloom_ || bloom_.MayContainHash(hash);
 }
 
-Slice PlainTableReader::GetPrefix(const ParsedInternalKey& target) const {
-  return GetPrefixFromUserKey(target.user_key);
-}
-
 Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
                               ParsedInternalKey* parsed_key,
@@ -650,8 +525,7 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
   Slice prefix_slice;
   uint32_t prefix_hash;
   if (IsTotalOrderMode()) {
-    if (index_size_ == kFullScanModeFlag) {
-      // Full Scan Mode
+    if (full_scan_mode_) {
       status_ =
           Status::InvalidArgument("Get() is not allowed in full scan mode.");
     }
@@ -682,7 +556,6 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
   if (!ParseInternalKey(target, &parsed_target)) {
     return Status::Corruption(Slice());
   }
-
   Slice found_value;
   PlainTableKeyDecoder decoder(encoding_type_, user_key_len_,
                                options_.prefix_extractor.get());
@@ -747,13 +620,12 @@ void PlainTableIterator::Seek(const Slice& target) {
   // If the user doesn't set prefix seek option and we are not able to do a
   // total Seek(). assert failure.
   if (!use_prefix_seek_) {
-    if (table_->index_size_ == PlainTableReader::kFullScanModeFlag) {
-      // Full Scan Mode.
+    if (table_->full_scan_mode_) {
       status_ =
           Status::InvalidArgument("Seek() is not allowed in full scan mode.");
       offset_ = next_offset_ = table_->data_end_offset_;
       return;
-    } else if (table_->index_size_ > 1) {
+    } else if (table_->GetIndexSize() > 1) {
       assert(false);
       status_ = Status::NotSupported(
           "PlainTable cannot issue non-prefix seek unless in total order "

table/plain_table_reader.h

@@ -19,12 +19,14 @@
 #include "rocksdb/table_properties.h"
 #include "table/table_reader.h"
 #include "table/plain_table_factory.h"
+#include "table/plain_table_index.h"
 #include "util/arena.h"
 #include "util/dynamic_bloom.h"
 
 namespace rocksdb {
 
 class Block;
+class BlockContents;
 class BlockHandle;
 class Footer;
 struct Options;
@@ -37,6 +39,7 @@ class PlainTableKeyDecoder;
 using std::unique_ptr;
 using std::unordered_map;
+using std::vector;
 
 extern const uint32_t kPlainTableVariableLength;
 
 // Based on following output file format shown in plain_table_factory.h
@@ -68,6 +71,7 @@ class PlainTableReader: public TableReader {
   uint64_t ApproximateOffsetOf(const Slice& key);
 
+  uint32_t GetIndexSize() const { return index_.GetIndexSize(); }
 
   void SetupForCompaction();
 
   std::shared_ptr<const TableProperties> GetTableProperties() const {
@@ -93,65 +97,23 @@ class PlainTableReader: public TableReader {
   // props: the table properties object that need to be stored. Ownership of
   // the object will be passed.
   //
-  // index_ contains buckets size of index_size_, each is a
-  // 32-bit integer. The lower 31 bits contain an offset value (explained below)
-  // and the first bit of the integer indicates type of the offset.
-  //
-  // +--------------+------------------------------------------------------+
-  // | Flag (1 bit) | Offset to binary search buffer or file (31 bits)     +
-  // +--------------+------------------------------------------------------+
-  //
-  // Explanation for the "flag bit":
-  //
-  // 0 indicates that the bucket contains only one prefix (no conflict when
-  // hashing this prefix), whose first row starts from this offset of the
-  // file.
-  // 1 indicates that the bucket contains more than one prefixes, or there
-  // are too many rows for one prefix so we need a binary search for it. In
-  // this case, the offset indicates the offset of sub_index_ holding the
-  // binary search indexes of keys for those rows. Those binary search indexes
-  // are organized in this way:
-  //
-  // The first 4 bytes, indicate how many indexes (N) are stored after it. After
-  // it, there are N 32-bit integers, each points of an offset of the file,
-  // which
-  // points to starting of a row. Those offsets need to be guaranteed to be in
-  // ascending order so the keys they are pointing to are also in ascending
-  // order
-  // to make sure we can use them to do binary searches. Below is visual
-  // presentation of a bucket.
-  //
-  // <begin>
-  // number_of_records: varint32
-  // record 1 file offset: fixedint32
-  // record 2 file offset: fixedint32
-  // ....
-  // record N file offset: fixedint32
-  // <end>
   Status PopulateIndex(TableProperties* props, int bloom_bits_per_key,
                        double hash_table_ratio, size_t index_sparseness,
                        size_t huge_page_tlb_size);
 
   Status MmapDataFile();
 
  private:
-  struct IndexRecord;
-  class IndexRecordList;
-
-  // Plain table maintains an index and a sub index.
-  // index is implemented by a hash table.
-  // subindex is a big of memory array.
-  // For more details about the in-memory index, please refer to:
-  // https://github.com/facebook/rocksdb/wiki/PlainTable-Format
-  // #wiki-in-memory-index-format
-  uint32_t* index_;
-  int index_size_ = 0;
-  char* sub_index_;
-
   const InternalKeyComparator internal_comparator_;
   EncodingType encoding_type_;
   // represents plain table's current status.
   Status status_;
   Slice file_data_;
 
+  PlainTableIndex index_;
+  bool full_scan_mode_;
+
   // data_start_offset_ and data_end_offset_ defines the range of the
   // sst file that stores data.
   const uint32_t data_start_offset_ = 0;
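The comment block removed above is still the clearest description of the bucket encoding that now lives behind PlainTableIndex: a 32-bit bucket whose top bit is a flag (0 = offset of the first row for a single prefix, 1 = offset into the sub-index, where a varint32 record count is followed by that many ascending fixed32 file offsets). A small self-contained decoder for exactly that layout, reusing the old 0x80000000 mask; the varint helper is a local re-implementation, not RocksDB's GetVarint32Ptr():

    // Decode one bucket: MSB flag + 31-bit offset; sub-index = varint32
    // count, then fixed32 little-endian file offsets.
    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <vector>

    const uint32_t kSubIndexMask = 0x80000000;  // flag bit from the old reader

    const char* DecodeVarint32(const char* p, uint32_t* value) {
      uint32_t result = 0;
      for (int shift = 0; shift <= 28; shift += 7) {
        uint32_t byte = static_cast<unsigned char>(*p++);
        result |= (byte & 0x7f) << shift;
        if ((byte & 0x80) == 0) break;
      }
      *value = result;
      return p;
    }

    int main() {
      uint32_t bucket = 0u | kSubIndexMask;  // flag set: points into sub-index
      // count = 3, then offsets 0, 16, 32 as little-endian fixed32 values.
      std::vector<char> sub_index = {3, 0, 0, 0, 0, 16, 0, 0, 0, 32, 0, 0, 0};

      if (bucket & kSubIndexMask) {
        const char* p = sub_index.data() + (bucket ^ kSubIndexMask);
        uint32_t n = 0;
        p = DecodeVarint32(p, &n);
        for (uint32_t i = 0; i < n; ++i) {
          uint32_t offset = 0;
          std::memcpy(&offset, p + i * sizeof(uint32_t), sizeof(uint32_t));
          // Prints 0, 16, 32 on a little-endian host.
          std::cout << "row file offset: " << offset << "\n";
        }
      } else {
        std::cout << "single prefix starts at file offset " << bucket << "\n";
      }
      return 0;
    }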
@@ -160,11 +122,6 @@ class PlainTableReader: public TableReader {
   const SliceTransform* prefix_extractor_;
 
   static const size_t kNumInternalBytes = 8;
-  static const uint32_t kSubIndexMask = 0x80000000;
-  static const size_t kOffsetLen = sizeof(uint32_t);
-  static const uint64_t kMaxFileSize = 1u << 31;
-  static const size_t kRecordsPerGroup = 256;
-  static const int kFullScanModeFlag = -1;
 
   // Bloom filter is used to rule out non-existent key
   bool enable_bloom_;
@@ -184,6 +141,31 @@ class PlainTableReader: public TableReader {
     return user_key_len_ + kNumInternalBytes;
   }
 
+  Slice GetPrefix(const Slice& target) const {
+    assert(target.size() >= 8);  // target is internal key
+    return GetPrefixFromUserKey(GetUserKey(target));
+  }
+
+  Slice GetPrefix(const ParsedInternalKey& target) const {
+    return GetPrefixFromUserKey(target.user_key);
+  }
+
+  Slice GetUserKey(const Slice& key) const {
+    return Slice(key.data(), key.size() - 8);
+  }
+
+  Slice GetPrefixFromUserKey(const Slice& user_key) const {
+    if (!IsTotalOrderMode()) {
+      return prefix_extractor_->Transform(user_key);
+    } else {
+      // Use empty slice as prefix if prefix_extractor is not set.
+      // In that case, it falls back to pure binary search and
+      // total iterator seek is supported.
+      return Slice();
+    }
+  }
+
   friend class TableCache;
   friend class PlainTableIterator;
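The helpers moved above make the prefix policy explicit: with a prefix extractor configured, the user-key prefix drives hashing; in total order mode the prefix degenerates to the empty slice, so every key lands in the same bucket. A tiny sketch of that fallback, using a fixed 8-byte substring as a stand-in for a SliceTransform:

    // GetPrefixFromUserKey() fallback: fixed-size prefix with an extractor,
    // empty prefix in total order mode.
    #include <algorithm>
    #include <iostream>
    #include <string>

    std::string PrefixOf(const std::string& user_key, bool total_order_mode) {
      if (total_order_mode) {
        return std::string();  // empty prefix: everything shares one bucket
      }
      return user_key.substr(0, std::min<size_t>(8, user_key.size()));
    }

    int main() {
      std::cout << "hash mode:   '" << PrefixOf("1000000000000foo", false)
                << "'\n";  // '10000000'
      std::cout << "total order: '" << PrefixOf("1000000000000foo", true)
                << "'\n";  // ''
      return 0;
    }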
@@ -191,33 +173,15 @@
   // the rows, which contains index records as a list.
   // If bloom_ is not null, all the keys' full-key hash will be added to the
   // bloom filter.
-  Status PopulateIndexRecordList(IndexRecordList* record_list,
-                                 int* num_prefixes, int bloom_bits_per_key,
-                                 size_t index_sparseness);
-
-  // Internal helper function to allocate memory for indexes and bloom filters
-  void AllocateIndexAndBloom(int num_prefixes, int bloom_bits_per_key,
-                             double hash_table_ratio,
-                             size_t huge_page_tlb_size);
-
-  // Internal helper function to bucket index record list to hash buckets.
-  // bucket_header is a vector of size hash_table_size_, with each entry
-  // containing a linklist of IndexRecord hashed to the same bucket, in reverse
-  // order.
-  // of offsets for the hash, in reversed order.
-  // entries_per_bucket is sized of index_size_. The value is how many index
-  // records are there in bucket_headers for the same bucket.
-  size_t BucketizeIndexesAndFillBloom(
-      IndexRecordList* record_list, std::vector<IndexRecord*>* bucket_headers,
-      std::vector<uint32_t>* entries_per_bucket);
-
-  // Internal helper class to fill the indexes and bloom filters to internal
-  // data structures. bucket_headers and entries_per_bucket are bucketized
-  // indexes and counts generated by BucketizeIndexesAndFillBloom().
-  void FillIndexes(const size_t kSubIndexSize,
-                   const std::vector<IndexRecord*>& bucket_headers,
-                   const std::vector<uint32_t>& entries_per_bucket,
-                   size_t huge_page_tlb_size);
+  Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder,
+                                 vector<uint32_t>* prefix_hashes);
+
+  // Internal helper function to allocate memory for bloom filter and fill it
+  void AllocateAndFillBloom(int bloom_bits_per_key, int num_prefixes,
+                            size_t huge_page_tlb_size,
+                            vector<uint32_t>* prefix_hashes);
+
+  void FillBloom(vector<uint32_t>* prefix_hashes);
 
   // Read the key and value at `offset` to parameters for keys, the and
   // `seekable`.
@@ -237,28 +201,6 @@
                     uint32_t prefix_hash, bool& prefix_matched,
                     uint32_t* offset) const;
 
-  Slice GetUserKey(const Slice& key) const {
-    return Slice(key.data(), key.size() - 8);
-  }
-
-  Slice GetPrefix(const Slice& target) const {
-    assert(target.size() >= 8);  // target is internal key
-    return GetPrefixFromUserKey(GetUserKey(target));
-  }
-
-  inline Slice GetPrefix(const ParsedInternalKey& target) const;
-
-  Slice GetPrefixFromUserKey(const Slice& user_key) const {
-    if (!IsTotalOrderMode()) {
-      return prefix_extractor_->Transform(user_key);
-    } else {
-      // Use empty slice as prefix if prefix_extractor is not set. In that case,
-      // it falls back to pure binary search and total iterator seek is
-      // supported.
-      return Slice();
-    }
-  }
-
   bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
 
   // No copying allowed

util/dynamic_bloom.cc

@@ -48,6 +48,13 @@ DynamicBloom::DynamicBloom(uint32_t num_probes,
       kNumProbes(num_probes),
       hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {}
 
+void DynamicBloom::SetRawData(unsigned char* raw_data, uint32_t total_bits,
+                              uint32_t num_blocks) {
+  data_ = raw_data;
+  kTotalBits = total_bits;
+  kNumBlocks = num_blocks;
+}
+
 void DynamicBloom::SetTotalBits(Arena* arena,
                                 uint32_t total_bits, uint32_t locality,
                                 size_t huge_page_tlb_size,

util/dynamic_bloom.h

@@ -5,6 +5,10 @@
 #pragma once
+#include <string>
+
+#include "rocksdb/slice.h"
+
 #include <util/arena.h>
 #include <port/port_posix.h>
@@ -57,6 +61,19 @@ class DynamicBloom {
   void Prefetch(uint32_t h);
 
+  uint32_t GetNumBlocks() const { return kNumBlocks; }
+
+  Slice GetRawData() const {
+    return Slice(reinterpret_cast<char*>(data_), GetTotalBits() / 8);
+  }
+
+  void SetRawData(unsigned char* raw_data, uint32_t total_bits,
+                  uint32_t num_blocks = 0);
+
+  uint32_t GetTotalBits() const { return kTotalBits; }
+
+  bool IsInitialized() const { return kNumBlocks > 0 || kTotalBits > 0; }
+
  private:
   uint32_t kTotalBits;
   uint32_t kNumBlocks;
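GetRawData()/SetRawData() are what make the filter persistable: the builder can dump the bit array into the bloom meta block, and the reader can point a filter at that stored memory instead of re-adding every hash (the real class also restores num_blocks for its cache-line-local probing mode). A toy round trip with a single-probe bit array, not DynamicBloom itself:

    // Toy GetRawData()/SetRawData() round trip: copy the bit array out as a
    // byte string, then attach it read-only and probe it.
    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <vector>

    struct ToyBloom {
      const unsigned char* data = nullptr;
      uint32_t total_bits = 0;
      void SetRawData(const unsigned char* raw, uint32_t bits) {
        data = raw;  // no rehashing: just point at the stored bytes
        total_bits = bits;
      }
      bool MayContainHash(uint32_t h) const {
        uint32_t bitpos = h % total_bits;
        return (data[bitpos / 8] & (1u << (bitpos % 8))) != 0;
      }
    };

    int main() {
      // Builder side: set bits for a few hashes, then "write the meta block".
      std::vector<unsigned char> bits(16, 0);  // 128 bits
      for (uint32_t h : {11u, 42u, 99u}) {
        bits[(h % 128) / 8] |= 1u << ((h % 128) % 8);
      }
      std::string block(reinterpret_cast<char*>(bits.data()), bits.size());

      // Reader side: re-attach the stored bytes and query.
      ToyBloom bloom;
      bloom.SetRawData(reinterpret_cast<const unsigned char*>(block.data()),
                       static_cast<uint32_t>(block.size() * 8));
      std::cout << bloom.MayContainHash(42) << " " << bloom.MayContainHash(43)
                << "\n";  // 1 0
      return 0;
    }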
@@ -81,7 +98,7 @@ inline void DynamicBloom::Prefetch(uint32_t h) {
 }
 
 inline bool DynamicBloom::MayContainHash(uint32_t h) const {
-  assert(kNumBlocks > 0 || kTotalBits > 0);
+  assert(IsInitialized());
   const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
   if (kNumBlocks != 0) {
     uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
@@ -98,10 +115,6 @@ inline bool DynamicBloom::MayContainHash(uint32_t h) const {
       h += delta;
     }
   } else {
-    if (kTotalBits == 0) {
-      // Not initialized.
-      return true;
-    }
     for (uint32_t i = 0; i < kNumProbes; ++i) {
       const uint32_t bitpos = h % kTotalBits;
       if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
} }
inline void DynamicBloom::AddHash(uint32_t h) { inline void DynamicBloom::AddHash(uint32_t h) {
assert(kNumBlocks > 0 || kTotalBits > 0); assert(IsInitialized());
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
if (kNumBlocks != 0) { if (kNumBlocks != 0) {
uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8); uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);

util/hash.h

@@ -17,4 +17,7 @@ namespace rocksdb {
 extern uint32_t Hash(const char* data, size_t n, uint32_t seed);
 
+inline uint32_t GetSliceHash(const Slice& s) {
+  return Hash(s.data(), s.size(), 397);
+}
 
 }
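GetSliceHash() pins the seed (397) so that the builder, the index lookup, and the bloom probe all map a given prefix to the same 32-bit value; build time and read time must agree or every lookup misses. A trivial sketch of that invariant with a stand-in hash:

    // Build time and read time must hash a prefix identically; ToyHash
    // stands in for Hash(data, size, 397).
    #include <cstdint>
    #include <iostream>
    #include <string>

    uint32_t ToyHash(const std::string& s, uint32_t seed = 397) {
      uint32_t h = seed;
      for (unsigned char c : s) h = h * 31 + c;
      return h;
    }

    int main() {
      uint32_t at_build_time = ToyHash("00000000");  // hashed when the index is written
      uint32_t at_read_time = ToyHash("00000000");   // hashed again inside Get()
      std::cout << (at_build_time == at_read_time
                        ? "same bucket and bloom probe\n"
                        : "lookup would miss\n");
      return 0;
    }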
