Refactoring

Summary: This is the first split of https://github.com/facebook/rocksdb/pull/1891 and will be needed for the upcoming partitioned filter patch. Closes https://github.com/facebook/rocksdb/pull/1949 Differential Revision: D4652152 Pulled By: maysamyabandeh fbshipit-source-id: 9801778
9 years ago · a2f7a514d1
parent 2a5daa06f0
commit a2f7a514d1
13 changed files with 678 additions and 559 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -336,10 +336,12 @@ set(SOURCES
        table/format.cc
        table/full_filter_block.cc
        table/get_context.cc
        table/index_builder.cc
        table/iterator.cc
        table/merging_iterator.cc
        table/sst_file_writer.cc
        table/meta_blocks.cc
        table/partitioned_filter_block.cc
        table/plain_table_builder.cc
        table/plain_table_factory.cc
        table/plain_table_index.cc
--- a/src.mk
+++ b/src.mk
@ -72,10 +72,12 @@ LIB_SOURCES =                                                   \
  table/format.cc                                               \
  table/full_filter_block.cc                                    \
  table/get_context.cc                                          \
  table/index_builder.cc                                        \
  table/iterator.cc                                             \
  table/merging_iterator.cc                                     \
  table/meta_blocks.cc                                          \
  table/sst_file_writer.cc                                      \
  table/partitioned_filter_block.cc                             \
  table/plain_table_builder.cc                                  \
  table/plain_table_factory.cc                                  \
  table/plain_table_index.cc                                    \
--- a/table/block_based_table_builder.cc
+++ b/table/block_based_table_builder.cc
@ -31,14 +31,16 @@
 #include "rocksdb/table.h"
 #include "table/block.h"
 #include "table/block_based_filter_block.h"
 #include "table/block_based_table_factory.h"
 #include "table/block_based_table_reader.h"
 #include "table/block_builder.h"
 #include "table/filter_block.h"
 #include "table/block_based_filter_block.h"
 #include "table/block_based_table_factory.h"
 #include "table/full_filter_block.h"
 #include "table/format.h"
 #include "table/full_filter_block.h"
 #include "table/index_builder.h"
 #include "table/meta_blocks.h"
 #include "table/partitioned_filter_block.h"
 #include "table/table_builder.h"
 #include "util/string_util.h"
@ -54,384 +56,10 @@ extern const std::string kHashIndexPrefixesBlock;
 extern const std::string kHashIndexPrefixesMetadataBlock;
 typedef BlockBasedTableOptions::IndexType IndexType;
 class IndexBuilder;
 namespace {
 rocksdb::IndexBuilder* CreateIndexBuilder(
    IndexType index_type, const InternalKeyComparator* comparator,
    const SliceTransform* prefix_extractor, int index_block_restart_interval,
    uint64_t index_per_partition);
 }
 // The interface for building an index.
 // Instructions for adding a new concrete IndexBuilder:
 //  1. Create a subclass derived from IndexBuilder.
 //  2. Add a new entry associated with that subclass in
 //     BlockBasedTableOptions::IndexType.
 //  3. Add a create function for the new subclass in CreateIndexBuilder.
 // Note: we could devise a more advanced design to simplify the process of
 // adding a new subclass, which would, on the other hand, increase the code
 // complexity and catch unwanted attention from readers. Given that we won't
 // add/change indexes frequently, it makes sense to just embrace a more
 // straightforward design that just works.
 class IndexBuilder {
 public:
  // Index builder will construct a set of blocks which contain:
  //  1. One primary index block.
  //  2. (Optional) a set of metablocks that contains the metadata of the
  //     primary index.
  struct IndexBlocks {
    Slice index_block_contents;
    std::unordered_map<std::string, Slice> meta_blocks;
  };
  // Stores `comparator` as a raw pointer; this class never deletes it.
  explicit IndexBuilder(const InternalKeyComparator* comparator)
      : comparator_(comparator) {}
  virtual ~IndexBuilder() {}
  // Add a new index entry to the index block.
  // To allow further optimization, we provide `last_key_in_current_block` and
  // `first_key_in_next_block`, based on which the specific implementation can
  // determine the best index key to be used for the index block.
  // @last_key_in_current_block: this parameter may be overwritten with the
  //                             value of the "substitute key".
  // @first_key_in_next_block: it will be nullptr if the entry being added is
  //                           the last one in the table.
  //
  // REQUIRES: Finish() has not yet been called.
  virtual void AddIndexEntry(std::string* last_key_in_current_block,
                             const Slice* first_key_in_next_block,
                             const BlockHandle& block_handle) = 0;
  // This method will be called whenever a key is added. Subclasses may
  // override OnKeyAdded() if they need to collect additional information.
  // The default implementation does nothing.
  virtual void OnKeyAdded(const Slice& key) {}
  // Inform the index builder that all entries have been written. The builder
  // may therefore perform any operation required for block finalization.
  // Convenience overload that forwards to the two-argument Finish() with a
  // default-constructed BlockHandle.
  //
  // REQUIRES: Finish() has not yet been called.
  inline Status Finish(IndexBlocks* index_blocks) {
    // Throw away the changes to last_partition_block_handle. It has no effect
    // on the first call to Finish anyway.
    BlockHandle last_partition_block_handle;
    return Finish(index_blocks, last_partition_block_handle);
  }
  // This overload of Finish can be utilized to build the 2nd level index in
  // PartitionIndexBuilder.
  //
  // index_blocks will be filled with the resulting index data. If the return
  // value is Status::Incomplete() then it means that the index is partitioned
  // and the caller should keep calling Finish until Status::OK() is returned.
  // In that case, last_partition_block_handle is the handle of the block
  // written with the result of the last call to Finish. This can be utilized
  // to build the second level index pointing to each block of partitioned
  // indexes. The last call to Finish() that returns Status::OK() populates
  // index_blocks with the 2nd level index content.
  virtual Status Finish(IndexBlocks* index_blocks,
                        const BlockHandle& last_partition_block_handle) = 0;
  // Get the estimated size of the index block.
  virtual size_t EstimatedSize() const = 0;
 protected:
  // Comparator handed to the constructor; available to subclasses.
  const InternalKeyComparator* comparator_;
 };
 // Index builder that produces a space-efficient index block.
 //
 // Optimizations:
 //  1. The block's `block_restart_interval` is meant to be 1, which avoids a
 //     linear search during index lookup (this can be changed through
 //     index_block_restart_interval).
 //  2. Index keys are shortened. Instead of storing the last key of each data
 //     block verbatim, a shortest substitute key that serves the same
 //     function is stored.
 class ShortenedIndexBuilder : public IndexBuilder {
 public:
  explicit ShortenedIndexBuilder(const InternalKeyComparator* comparator,
                                 int index_block_restart_interval)
      : IndexBuilder(comparator),
        index_block_builder_(index_block_restart_interval) {}
  // Shortens `*last_key_in_current_block` in place and adds it to the index
  // block together with the encoded `block_handle`.
  virtual void AddIndexEntry(std::string* last_key_in_current_block,
                             const Slice* first_key_in_next_block,
                             const BlockHandle& block_handle) override {
    const bool is_last_block = (first_key_in_next_block == nullptr);
    if (is_last_block) {
      // Nothing follows this block: any short successor will do.
      comparator_->FindShortSuccessor(last_key_in_current_block);
    } else {
      // Pick a short key separating this block from the next one.
      comparator_->FindShortestSeparator(last_key_in_current_block,
                                         *first_key_in_next_block);
    }
    std::string encoded_handle;
    block_handle.EncodeTo(&encoded_handle);
    index_block_builder_.Add(*last_key_in_current_block, encoded_handle);
  }
  // Finalizes the single index block. `last_partition_block_handle` is not
  // used by this builder. Always returns Status::OK().
  virtual Status Finish(
      IndexBlocks* index_blocks,
      const BlockHandle& last_partition_block_handle) override {
    Slice finished_block = index_block_builder_.Finish();
    index_blocks->index_block_contents = finished_block;
    return Status::OK();
  }
  // Current size estimate reported by the underlying block builder.
  virtual size_t EstimatedSize() const override {
    return index_block_builder_.CurrentSizeEstimate();
  }
 private:
  BlockBuilder index_block_builder_;
 };
 /**
 * IndexBuilder for two-level indexing. Internally it creates a new index for
 * each partition and finishes them in order as Finish is called on it
 * repeatedly, until Status::OK() is returned.
 *
 * The format on the disk would be I I I I I I IP where I is a block containing
 * a partition of indexes built using ShortenedIndexBuilder and IP is a block
 * containing a secondary index on the partitions, built using
 * ShortenedIndexBuilder.
 */
 class PartitionIndexBuilder : public IndexBuilder {
 public:
  // Creates the builder together with the first (active) partition builder.
  explicit PartitionIndexBuilder(const InternalKeyComparator* comparator,
                                 const SliceTransform* prefix_extractor,
                                 const uint64_t index_per_partition,
                                 int index_block_restart_interval)
      : IndexBuilder(comparator),
        prefix_extractor_(prefix_extractor),
        index_block_builder_(index_block_restart_interval),
        index_per_partition_(index_per_partition),
        index_block_restart_interval_(index_block_restart_interval) {
    sub_index_builder_ =
        CreateIndexBuilder(sub_type_, comparator_, prefix_extractor_,
                           index_block_restart_interval_, index_per_partition_);
  }
  // Deletes the active partition builder, if any. Builders already moved into
  // entries_ are owned by their unique_ptr and are not touched here.
  virtual ~PartitionIndexBuilder() { delete sub_index_builder_; }
  // Forwards the entry to the active partition builder. When the partition is
  // full, or when this is the last entry of the table, the active builder is
  // moved into entries_ keyed by the (possibly shortened) last key.
  virtual void AddIndexEntry(std::string* last_key_in_current_block,
                             const Slice* first_key_in_next_block,
                             const BlockHandle& block_handle) override {
    sub_index_builder_->AddIndexEntry(last_key_in_current_block,
                                      first_key_in_next_block, block_handle);
    num_indexes++;
    if (UNLIKELY(first_key_in_next_block == nullptr)) {  // no more keys
      // Ownership of the raw pointer moves into the unique_ptr; the member is
      // cleared so the destructor does not delete it a second time.
      entries_.push_back({std::string(*last_key_in_current_block),
                          std::unique_ptr<IndexBuilder>(sub_index_builder_)});
      sub_index_builder_ = nullptr;
      // NOTE(review): index_per_partition_ == 0 would make the modulo below
      // divide by zero -- confirm the option is validated before it gets here.
    } else if (num_indexes % index_per_partition_ == 0) {
      // Partition is full: park it and start a fresh one.
      entries_.push_back({std::string(*last_key_in_current_block),
                          std::unique_ptr<IndexBuilder>(sub_index_builder_)});
      sub_index_builder_ = CreateIndexBuilder(
          sub_type_, comparator_, prefix_extractor_,
          index_block_restart_interval_, index_per_partition_);
    }
  }
  // Each call either finishes the next pending partition and returns
  // Status::Incomplete() (or the partition's error status), or, once no
  // partition is left, finishes the top-level index and returns Status::OK().
  // On every call after the first, `last_partition_block_handle` is recorded
  // in the top-level index under the key of the partition finished by the
  // previous call.
  virtual Status Finish(
      IndexBlocks* index_blocks,
      const BlockHandle& last_partition_block_handle) override {
    assert(!entries_.empty());
    // It must be set to null after last key is added
    assert(sub_index_builder_ == nullptr);
    if (finishing == true) {
      // The front entry is the partition finished by the previous call; add
      // its handle to the top-level index and drop it.
      Entry& last_entry = entries_.front();
      std::string handle_encoding;
      last_partition_block_handle.EncodeTo(&handle_encoding);
      index_block_builder_.Add(last_entry.key, handle_encoding);
      entries_.pop_front();
    }
    // If there is no sub_index left, then return the 2nd level index.
    if (UNLIKELY(entries_.empty())) {
      index_blocks->index_block_contents = index_block_builder_.Finish();
      return Status::OK();
    } else {
      // Finish the next partition index in line and return Incomplete() to
      // indicate we expect more calls to Finish
      Entry& entry = entries_.front();
      auto s = entry.value->Finish(index_blocks);
      finishing = true;
      return s.ok() ? Status::Incomplete() : s;
    }
  }
  // Sum of the estimates of all parked partitions, the top-level index and
  // the active partition (if there is one).
  virtual size_t EstimatedSize() const override {
    size_t total = 0;
    for (auto it = entries_.begin(); it != entries_.end(); ++it) {
      total += it->value->EstimatedSize();
    }
    total += index_block_builder_.CurrentSizeEstimate();
    total +=
        sub_index_builder_ == nullptr ? 0 : sub_index_builder_->EstimatedSize();
    return total;
  }
 private:
  // Index type used for every partition.
  static const IndexType sub_type_ = BlockBasedTableOptions::kBinarySearch;
  // A finished-or-pending partition together with the key it is indexed by.
  struct Entry {
    std::string key;
    std::unique_ptr<IndexBuilder> value;
  };
  std::list<Entry> entries_;  // list of partitioned indexes and their keys
  const SliceTransform* prefix_extractor_;
  BlockBuilder index_block_builder_;  // top-level index builder
  IndexBuilder* sub_index_builder_;   // the active partition index builder
  uint64_t index_per_partition_;
  int index_block_restart_interval_;
  uint64_t num_indexes = 0;  // number of entries added so far
  bool finishing =
      false;  // true if Finish is called once but not complete yet.
 };
 // HashIndexBuilder contains a binary-searchable primary index and the
 // metadata for secondary hash index construction.
 // The metadata for the hash index consists of two parts:
 //  - a metablock that compactly contains a sequence of prefixes. All prefixes
 //    are stored consecutively without any metadata (like prefix sizes) being
 //    stored; that metadata is kept in the other metablock.
 //  - a metablock that contains the metadata of the prefixes, including prefix
 //    size, restart index and number of blocks it spans. The format looks
 //    like:
 //
 // +-----------------+---------------------------+---------------------+ <=prefix 1
 // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
 // +-----------------+---------------------------+---------------------+ <=prefix 2
 // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
 // +-----------------+---------------------------+---------------------+
 // |                                                                   |
 // | ....                                                              |
 // |                                                                   |
 // +-----------------+---------------------------+---------------------+ <=prefix n
 // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
 // +-----------------+---------------------------+---------------------+
 //
 // NOTE(review): FlushPendingPrefix() writes the three fields with
 // PutVarint32Varint32Varint32, so the fixed "4 bytes" widths in the diagram
 // presumably do not match the actual encoding -- confirm and update.
 //
 // The reason for separating these two metablocks is to enable efficient
 // reuse of the first metablock during hash index construction without
 // unnecessary data copies or small heap allocations for prefixes.
 class HashIndexBuilder : public IndexBuilder {
 public:
  explicit HashIndexBuilder(const InternalKeyComparator* comparator,
                            const SliceTransform* hash_key_extractor,
                            int index_block_restart_interval)
      : IndexBuilder(comparator),
        primary_index_builder_(comparator, index_block_restart_interval),
        hash_key_extractor_(hash_key_extractor) {}
  // Counts the data block and forwards the entry to the primary index.
  virtual void AddIndexEntry(std::string* last_key_in_current_block,
                             const Slice* first_key_in_next_block,
                             const BlockHandle& block_handle) override {
    ++current_restart_index_;
    primary_index_builder_.AddIndexEntry(last_key_in_current_block,
                                        first_key_in_next_block, block_handle);
  }
  // Tracks the prefix of every added key. A run of keys sharing one prefix is
  // kept pending and flushed as a single record when the prefix changes.
  virtual void OnKeyAdded(const Slice& key) override {
    auto key_prefix = hash_key_extractor_->Transform(key);
    bool is_first_entry = pending_block_num_ == 0;
    // Keys may share the prefix
    if (is_first_entry || pending_entry_prefix_ != key_prefix) {
      if (!is_first_entry) {
        FlushPendingPrefix();
      }
      // need a hard copy otherwise the underlying data changes all the time.
      // TODO(kailiu) ToString() is expensive. We may speed this up by
      // avoiding the data copy.
      pending_entry_prefix_ = key_prefix.ToString();
      pending_block_num_ = 1;
      pending_entry_index_ = static_cast<uint32_t>(current_restart_index_);
    } else {
      // the block count increments when keys sharing the prefix reside in
      // different data blocks.
      auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1;
      assert(last_restart_index <= current_restart_index_);
      if (last_restart_index != current_restart_index_) {
        ++pending_block_num_;
      }
    }
  }
  // Flushes the pending prefix, finishes the primary index and publishes the
  // two prefix metablocks in index_blocks->meta_blocks.
  // Returns the status reported by the primary index builder.
  virtual Status Finish(
      IndexBlocks* index_blocks,
      const BlockHandle& last_partition_block_handle) override {
    FlushPendingPrefix();
    // Keep the primary builder's status instead of discarding it, so that a
    // failure there is not reported to the caller as Status::OK().
    Status s = primary_index_builder_.Finish(index_blocks,
                                             last_partition_block_handle);
    index_blocks->meta_blocks.insert(
        {kHashIndexPrefixesBlock.c_str(), prefix_block_});
    index_blocks->meta_blocks.insert(
        {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_});
    return s;
  }
  // Primary index estimate plus the bytes accumulated in both metablocks.
  virtual size_t EstimatedSize() const override {
    return primary_index_builder_.EstimatedSize() + prefix_block_.size() +
           prefix_meta_block_.size();
  }
 private:
  // Appends the pending prefix bytes to prefix_block_ and its
  // (size, entry index, block count) record to prefix_meta_block_.
  void FlushPendingPrefix() {
    prefix_block_.append(pending_entry_prefix_.data(),
                         pending_entry_prefix_.size());
    PutVarint32Varint32Varint32(
        &prefix_meta_block_,
        static_cast<uint32_t>(pending_entry_prefix_.size()),
        pending_entry_index_, pending_block_num_);
  }
  ShortenedIndexBuilder primary_index_builder_;
  const SliceTransform* hash_key_extractor_;
  // stores a sequence of prefixes
  std::string prefix_block_;
  // stores the metadata of prefixes
  std::string prefix_meta_block_;
  // The following 3 variables keep the unflushed prefix and its metadata.
  // The details of block_num and entry_index can be found in
  // "block_hash_index.{h,cc}"
  uint32_t pending_block_num_ = 0;
  uint32_t pending_entry_index_ = 0;
  std::string pending_entry_prefix_;
  // Number of AddIndexEntry() calls so far.
  uint64_t current_restart_index_ = 0;
 };
 // Without anonymous namespace here, we fail the warning -Wmissing-prototypes
 namespace {
 // Create an index builder based on its type.
 // Returns a heap-allocated builder (created with `new`) for the three known
 // index types. For an unrecognized type it asserts in debug builds and
 // returns nullptr.
 IndexBuilder* CreateIndexBuilder(IndexType index_type,
                                 const InternalKeyComparator* comparator,
                                 const SliceTransform* prefix_extractor,
                                 int index_block_restart_interval,
                                 uint64_t index_per_partition) {
  switch (index_type) {
    case BlockBasedTableOptions::kBinarySearch: {
      return new ShortenedIndexBuilder(comparator,
                                       index_block_restart_interval);
    }
    case BlockBasedTableOptions::kHashSearch: {
      return new HashIndexBuilder(comparator, prefix_extractor,
                                  index_block_restart_interval);
    }
    case BlockBasedTableOptions::kTwoLevelIndexSearch: {
      return new PartitionIndexBuilder(comparator, prefix_extractor,
                                       index_per_partition,
                                       index_block_restart_interval);
    }
    default: {
      assert(!"Do not recognize the index type ");
      return nullptr;
    }
  }
  // impossible: every switch branch above returns.
  assert(false);
  return nullptr;
 }
 // Create a filter block builder based on the given options.
 FilterBlockBuilder* CreateFilterBlockBuilder(const ImmutableCFOptions& opt,
    const BlockBasedTableOptions& table_opt) {
@ -649,11 +277,11 @@ struct BlockBasedTableBuilder::Rep {
                   table_options.use_delta_encoding),
        range_del_block(1),  // TODO(andrewkr): restart_interval unnecessary
        internal_prefix_transform(_ioptions.prefix_extractor),
-        index_builder(
+        index_builder(IndexBuilder::CreateIndexBuilder(
-            CreateIndexBuilder(table_options.index_type, &internal_comparator,
+            table_options.index_type, &internal_comparator,
-                               &this->internal_prefix_transform,
+            &this->internal_prefix_transform,
-                               table_options.index_block_restart_interval,
+            table_options.index_block_restart_interval,
-                               table_options.index_per_partition)),
+            table_options.index_per_partition)),
        compression_type(_compression_type),
        compression_opts(_compression_opts),
        compression_dict(_compression_dict),
--- a/table/block_based_table_reader.cc
+++ b/table/block_based_table_reader.cc
@ -57,6 +57,11 @@ using std::unique_ptr;
 typedef BlockBasedTable::IndexReader IndexReader;
 // Calls Close() and then deletes rep_.
 BlockBasedTable::~BlockBasedTable() {
  Close();
  delete rep_;
 }
 namespace {
 // Read the block identified by "handle" from "file".
 // The only relevant option is options.verify_checksums for now.
@ -143,42 +148,6 @@ Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key,
 }  // namespace
 // -- IndexReader and its subclasses
 // IndexReader is the interface that provides the functionality for index
 // access.
 class BlockBasedTable::IndexReader {
 public:
  // Stores both pointers as-is; this class never deletes them.
  explicit IndexReader(const Comparator* comparator, Statistics* stats)
      : comparator_(comparator), statistics_(stats) {}
  virtual ~IndexReader() {}
  // Create an iterator for index access.
  // If iter is null then a new object is created on the heap and the caller
  // takes ownership of it. If a non-null iter is passed in it will be used,
  // and the returned value is either the same as iter or a new on-heap object
  // that wraps the passed iter. In the latter case the return value points to
  // a different object than iter and the caller owns the returned object.
  // NOTE(review): the original wording said "callee" for the owner; presumably
  // a slip for "caller" -- confirm.
  virtual InternalIterator* NewIterator(BlockIter* iter = nullptr,
                                        bool total_order_seek = true) = 0;
  // The size of the index.
  virtual size_t size() const = 0;
  // Memory usage of the index block
  virtual size_t usable_size() const = 0;
  // return the statistics pointer
  virtual Statistics* statistics() const { return statistics_; }
  // Report an approximation of how much memory has been used other than memory
  // that was allocated in block cache.
  virtual size_t ApproximateMemoryUsage() const = 0;
 protected:
  const Comparator* comparator_;
 private:
  Statistics* statistics_;
 };
 // Index that allows binary search lookup in a two-level index structure.
 class PartitionIndexReader : public IndexReader {
 public:
@ -397,118 +366,6 @@ class HashIndexReader : public IndexReader {
  BlockContents prefixes_contents_;
 };
 // CachableEntry wraps an item that *may* have come from the block cache.
 //  `value` is the item itself.
 //  `cache_handle` is the block-cache handle for it; it stays nullptr when
 //    the value was not read from the cache.
 template <class TValue>
 struct BlockBasedTable::CachableEntry {
  CachableEntry(TValue* _value, Cache::Handle* _cache_handle)
      : value(_value), cache_handle(_cache_handle) {}
  CachableEntry() : CachableEntry(nullptr, nullptr) {}
  // Hands the handle back to `cache` and clears both fields. Does nothing
  // when no handle is held.
  void Release(Cache* cache) {
    if (cache_handle == nullptr) {
      return;
    }
    cache->Release(cache_handle);
    cache_handle = nullptr;
    value = nullptr;
  }
  // True when a cache handle is currently held.
  bool IsSet() const { return nullptr != cache_handle; }
  TValue* value = nullptr;
  // populated only when the entry came from the cache.
  Cache::Handle* cache_handle = nullptr;
 };
 // Internal state of a BlockBasedTable, reached through its rep_ pointer.
 // The options and comparator are held by reference, so the objects passed to
 // the constructor must outlive this struct.
 struct BlockBasedTable::Rep {
  // When skip_filters is true, filter_policy is left null regardless of what
  // _table_opt carries.
  Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options,
      const BlockBasedTableOptions& _table_opt,
      const InternalKeyComparator& _internal_comparator, bool skip_filters)
      : ioptions(_ioptions),
        env_options(_env_options),
        table_options(_table_opt),
        filter_policy(skip_filters ? nullptr : _table_opt.filter_policy.get()),
        internal_comparator(_internal_comparator),
        filter_type(FilterType::kNoFilter),
        whole_key_filtering(_table_opt.whole_key_filtering),
        prefix_filtering(true),
        range_del_handle(BlockHandle::NullBlockHandle()),
        global_seqno(kDisableGlobalSequenceNumber) {}
  const ImmutableCFOptions& ioptions;
  const EnvOptions& env_options;
  const BlockBasedTableOptions& table_options;
  const FilterPolicy* const filter_policy;
  const InternalKeyComparator& internal_comparator;
  Status status;
  unique_ptr<RandomAccessFileReader> file;
  // Cache key prefixes; each buffer is paired with the number of bytes of it
  // that are in use.
  char cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t cache_key_prefix_size = 0;
  char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t persistent_cache_key_prefix_size = 0;
  char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t compressed_cache_key_prefix_size = 0;
  uint64_t dummy_index_reader_offset =
      0;  // ID that is unique for the block cache.
  PersistentCacheOptions persistent_cache_options;
  // Footer contains the fixed table information
  Footer footer;
  // index_reader and filter will be populated and used only when
  // options.block_cache is nullptr; otherwise we will get the index block via
  // the block cache.
  unique_ptr<IndexReader> index_reader;
  unique_ptr<FilterBlockReader> filter;
  enum class FilterType {
    kNoFilter,
    kFullFilter,
    kBlockFilter,
  };
  FilterType filter_type;
  BlockHandle filter_handle;
  std::shared_ptr<const TableProperties> table_properties;
  // Block containing the data for the compression dictionary. We take ownership
  // for the entire block struct, even though we only use its Slice member. This
  // is easier because the Slice member depends on the continued existence of
  // another member ("allocation").
  std::unique_ptr<const BlockContents> compression_dict_block;
  BlockBasedTableOptions::IndexType index_type;
  bool hash_index_allow_collision;
  bool whole_key_filtering;
  bool prefix_filtering;
  // TODO(kailiu) It is very ugly to use internal key in table, since table
  // module should not be relying on db module. However to make things easier
  // and compatible with existing code, we introduce a wrapper that allows
  // block to extract prefix without knowing if a key is internal or not.
  unique_ptr<SliceTransform> internal_prefix_transform;
  // only used in level 0 files:
  // when pin_l0_filter_and_index_blocks_in_cache is true, we do use the
  // LRU cache, but we always keep the filter & index blocks' handles checked
  // out here (= we don't call Release()), plus the parsed-out objects.
  // The LRU cache will never flush them out, hence they're pinned.
  CachableEntry<FilterBlockReader> filter_entry;
  CachableEntry<IndexReader> index_entry;
  // range deletion meta-block is pinned through reader's lifetime when LRU
  // cache is enabled.
  CachableEntry<Block> range_del_entry;
  BlockHandle range_del_handle;
  // If global_seqno is used, all keys in this file will have the same
  // seqno with value `global_seqno`.
  //
  // A value of kDisableGlobalSequenceNumber means that this feature is disabled
  // and every key has its own seqno.
  SequenceNumber global_seqno;
 };
 // Calls Close() and then deletes rep_.
 BlockBasedTable::~BlockBasedTable() {
  Close();
  delete rep_;
 }
 // Helper function to setup the cache key's prefix for the Table.
 void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep, uint64_t file_size) {
  assert(kMaxCacheKeyPrefixSize >= 10);
@ -850,7 +707,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
      // Set filter block
      if (rep->filter_policy) {
-        rep->filter.reset(ReadFilter(rep));
+        rep->filter.reset(new_table->ReadFilter(rep));
      }
    } else {
      delete index_reader;
@ -1087,7 +944,7 @@ Status BlockBasedTable::PutDataBlockToCache(
  return s;
 }
-FilterBlockReader* BlockBasedTable::ReadFilter(Rep* rep) {
+FilterBlockReader* BlockBasedTable::ReadFilter(Rep* rep) const {
  // TODO: We might want to unify with ReadBlockFromFile() if we start
  // requiring checksum verification in Table::Open.
  if (rep->filter_type == Rep::FilterType::kNoFilter) {
--- a/table/block_based_table_reader.h
+++ b/table/block_based_table_reader.h
@ -20,6 +20,9 @@
 #include "rocksdb/statistics.h"
 #include "rocksdb/status.h"
 #include "rocksdb/table.h"
 #include "table/filter_block.h"
 #include "table/format.h"
 #include "table/persistent_cache_helper.h"
 #include "table/table_properties_internal.h"
 #include "table/table_reader.h"
 #include "table/two_level_iterator.h"
@ -139,8 +142,45 @@ class BlockBasedTable : public TableReader {
  bool TEST_filter_block_preloaded() const;
  bool TEST_index_reader_preloaded() const;
-  // Implementation of IndexReader will be exposed to internal cc file only.
+
-  class IndexReader;
+  // IndexReader is the interface that provide the functionality for index
  // access.
  class IndexReader {
   public:
    // Stores both pointers as-is; this class never deletes them.
    explicit IndexReader(const Comparator* comparator, Statistics* stats)
        : comparator_(comparator), statistics_(stats) {}
    virtual ~IndexReader() {}
    // Create an iterator for index access.
    // If iter is null then a new object is created on the heap and the caller
    // takes ownership of it. If a non-null iter is passed in it will be used,
    // and the returned value is either the same as iter or a new on-heap
    // object that wraps the passed iter. In the latter case the return value
    // points to a different object than iter and the caller owns the returned
    // object.
    // NOTE(review): the original wording said "callee" for the owner;
    // presumably a slip for "caller" -- confirm.
    virtual InternalIterator* NewIterator(BlockIter* iter = nullptr,
                                          bool total_order_seek = true) = 0;
    // The size of the index.
    virtual size_t size() const = 0;
    // Memory usage of the index block
    virtual size_t usable_size() const = 0;
    // return the statistics pointer
    virtual Statistics* statistics() const { return statistics_; }
    // Report an approximation of how much memory has been used other than
    // memory that was allocated in block cache.
    virtual size_t ApproximateMemoryUsage() const = 0;
   protected:
    const Comparator* comparator_;
   private:
    Statistics* statistics_;
  };
  static Slice GetCacheKey(const char* cache_key_prefix,
                           size_t cache_key_prefix_size,
@ -155,7 +195,6 @@ class BlockBasedTable : public TableReader {
 private:
  template <class TValue>
  struct CachableEntry;
  struct Rep;
  Rep* rep_;
  bool compaction_optimized_;
@ -251,7 +290,7 @@ class BlockBasedTable : public TableReader {
                              std::unique_ptr<InternalIterator>* iter);
  // Create the filter from the filter block.
-  static FilterBlockReader* ReadFilter(Rep* rep);
+  FilterBlockReader* ReadFilter(Rep* rep) const;
  static void SetupCacheKeyPrefix(Rep* rep, uint64_t file_size);
@ -290,4 +329,112 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState {
  bool skip_filters_;
 };
 // CachableEntry describes an entry that *may* have been fetched from the
 // block cache.
 //  `value`        - the item we want to get.
 //  `cache_handle` - the handle into the block cache; nullptr when the value
 //                   was not read from the cache.
 template <class TValue>
 struct BlockBasedTable::CachableEntry {
  CachableEntry() : CachableEntry(nullptr, nullptr) {}
  CachableEntry(TValue* _value, Cache::Handle* _cache_handle)
      : value(_value), cache_handle(_cache_handle) {}
  // True when a cache handle is held.
  bool IsSet() const { return cache_handle != nullptr; }
  // Gives the handle back to `cache` and clears both fields. When no handle
  // is held this is a no-op and `value` is left untouched.
  void Release(Cache* cache) {
    if (cache_handle == nullptr) {
      return;
    }
    cache->Release(cache_handle);
    value = nullptr;
    cache_handle = nullptr;
  }
  TValue* value = nullptr;
  // Populated only if the entry is from the cache.
  Cache::Handle* cache_handle = nullptr;
 };
 // Per-table state that BlockBasedTable keeps behind its `rep_` pointer.
 struct BlockBasedTable::Rep {
  // The option/comparator arguments are stored as references, so they must
  // outlive this object. When `skip_filters` is true no filter policy is
  // retained.
  Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options,
      const BlockBasedTableOptions& _table_opt,
      const InternalKeyComparator& _internal_comparator, bool skip_filters)
      : ioptions(_ioptions),
        env_options(_env_options),
        table_options(_table_opt),
        filter_policy(skip_filters ? nullptr : _table_opt.filter_policy.get()),
        internal_comparator(_internal_comparator),
        filter_type(FilterType::kNoFilter),
        // Deterministic defaults so these two members never hold
        // indeterminate values before they are assigned.
        index_type(BlockBasedTableOptions::kBinarySearch),
        hash_index_allow_collision(false),
        whole_key_filtering(_table_opt.whole_key_filtering),
        prefix_filtering(true),
        range_del_handle(BlockHandle::NullBlockHandle()),
        global_seqno(kDisableGlobalSequenceNumber) {}
  const ImmutableCFOptions& ioptions;
  const EnvOptions& env_options;
  const BlockBasedTableOptions& table_options;
  const FilterPolicy* const filter_policy;
  const InternalKeyComparator& internal_comparator;
  Status status;
  unique_ptr<RandomAccessFileReader> file;
  char cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t cache_key_prefix_size = 0;
  char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t persistent_cache_key_prefix_size = 0;
  char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t compressed_cache_key_prefix_size = 0;
  uint64_t dummy_index_reader_offset =
      0;  // ID that is unique for the block cache.
  PersistentCacheOptions persistent_cache_options;
  // Footer contains the fixed table information
  Footer footer;
  // index_reader and filter will be populated and used only when
  // options.block_cache is nullptr; otherwise we will get the index block via
  // the block cache.
  unique_ptr<IndexReader> index_reader;
  unique_ptr<FilterBlockReader> filter;
  enum class FilterType {
    kNoFilter,
    kFullFilter,
    kBlockFilter,
    kPartitionedFilter,
  };
  FilterType filter_type;
  BlockHandle filter_handle;
  std::shared_ptr<const TableProperties> table_properties;
  // Block containing the data for the compression dictionary. We take ownership
  // for the entire block struct, even though we only use its Slice member. This
  // is easier because the Slice member depends on the continued existence of
  // another member ("allocation").
  std::unique_ptr<const BlockContents> compression_dict_block;
  BlockBasedTableOptions::IndexType index_type;
  bool hash_index_allow_collision;
  bool whole_key_filtering;
  bool prefix_filtering;
  // TODO(kailiu) It is very ugly to use internal key in table, since table
  // module should not be relying on db module. However to make things easier
  // and compatible with existing code, we introduce a wrapper that allows
  // block to extract prefix without knowing if a key is internal or not.
  unique_ptr<SliceTransform> internal_prefix_transform;
  // Only used in level 0 files:
  // when pin_l0_filter_and_index_blocks_in_cache is true, we do use the
  // LRU cache, but we always keep the filter & index block's handle checked
  // out here (= we don't call Release()), plus the parsed out objects.
  // The LRU cache will never flush them out, hence they're pinned.
  CachableEntry<FilterBlockReader> filter_entry;
  CachableEntry<IndexReader> index_entry;
  // range deletion meta-block is pinned through reader's lifetime when LRU
  // cache is enabled.
  CachableEntry<Block> range_del_entry;
  BlockHandle range_del_handle;
  // If global_seqno is used, all Keys in this file will have the same
  // seqno with value `global_seqno`.
  //
  // A value of kDisableGlobalSequenceNumber means that this feature is disabled
  // and every key has its own seqno.
  SequenceNumber global_seqno;
 };
 }  // namespace rocksdb
--- a/table/format.h
+++ b/table/format.h
@ -15,8 +15,8 @@
 #include "rocksdb/options.h"
 #include "rocksdb/table.h"
-#include "port/port.h" // noexcept
+#include "port/port.h"  // noexcept
-#include "table/persistent_cache_helper.h"
+#include "table/persistent_cache_options.h"
 #include "util/cf_options.h"
 namespace rocksdb {
--- a/table/index_builder.cc
+++ b/table/index_builder.cc
@ -0,0 +1,52 @@
 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
 //  This source code is licensed under the BSD-style license found in the
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 //
 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 #include "table/index_builder.h"
 #include <assert.h>
 #include <inttypes.h>
 #include <list>
 #include <string>
 #include "rocksdb/comparator.h"
 #include "table/format.h"
 #include "table/partitioned_filter_block.h"
 // Without anonymous namespace here, we fail the warning -Wmissing-prototypes
 namespace rocksdb {
 // using namespace rocksdb;
 // Factory for index builders: allocates (with `new`) the IndexBuilder
 // implementation that corresponds to `index_type`. For an unrecognized type
 // an assertion fires and nullptr is returned.
 IndexBuilder* IndexBuilder::CreateIndexBuilder(
    BlockBasedTableOptions::IndexType index_type,
    const InternalKeyComparator* comparator,
    const SliceTransform* prefix_extractor, int index_block_restart_interval,
    uint64_t index_per_partition) {
  IndexBuilder* builder = nullptr;
  switch (index_type) {
    case BlockBasedTableOptions::kBinarySearch:
      builder =
          new ShortenedIndexBuilder(comparator, index_block_restart_interval);
      break;
    case BlockBasedTableOptions::kHashSearch:
      builder = new HashIndexBuilder(comparator, prefix_extractor,
                                     index_block_restart_interval);
      break;
    case BlockBasedTableOptions::kTwoLevelIndexSearch:
      builder = new PartitionIndexBuilder(comparator, prefix_extractor,
                                          index_per_partition,
                                          index_block_restart_interval);
      break;
    default:
      // Unknown index type: leave `builder` null.
      assert(!"Do not recognize the index type ");
      break;
  }
  return builder;
 }
 }  // namespace rocksdb
--- a/table/index_builder.h
+++ b/table/index_builder.h
@ -0,0 +1,265 @@
 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
 //  This source code is licensed under the BSD-style license found in the
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 //
 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 #pragma once
 #include <assert.h>
 #include <inttypes.h>
 #include <string>
 #include <unordered_map>
 #include "rocksdb/comparator.h"
 #include "table/block_based_table_factory.h"
 #include "table/block_builder.h"
 #include "table/format.h"
 namespace rocksdb {
 // Interface for building a table's index.
 //
 // Steps for adding a new concrete IndexBuilder:
 //  1. Create a subclass of IndexBuilder.
 //  2. Add an entry associated with that subclass to TableOptions::IndexType.
 //  3. Add a creation branch for the new subclass to CreateIndexBuilder.
 // Note: a more advanced design could simplify adding subclasses, but it would
 // also increase code complexity. Since indexes are not added or changed
 // frequently, the straightforward design is preferred.
 class IndexBuilder {
 public:
  // Creates the index builder that corresponds to `index_type`.
  static IndexBuilder* CreateIndexBuilder(
      BlockBasedTableOptions::IndexType index_type,
      const InternalKeyComparator* comparator,
      const SliceTransform* prefix_extractor, int index_block_restart_interval,
      uint64_t index_per_partition);
  // The set of blocks an index builder produces:
  //  1. One primary index block.
  //  2. (Optional) metablocks holding the metadata of the primary index.
  struct IndexBlocks {
    Slice index_block_contents;
    std::unordered_map<std::string, Slice> meta_blocks;
  };
  explicit IndexBuilder(const InternalKeyComparator* comparator)
      : comparator_(comparator) {}
  virtual ~IndexBuilder() = default;
  // Adds a new index entry to the index block.
  // Both `last_key_in_current_block` and `first_key_in_next_block` are
  // supplied so that an implementation can pick the best index key.
  // @last_key_in_current_block: may be overwritten with a "substitute key".
  // @first_key_in_next_block: nullptr if the entry being added is the last
  //                           one in the table.
  //
  // REQUIRES: Finish() has not yet been called.
  virtual void AddIndexEntry(std::string* last_key_in_current_block,
                             const Slice* first_key_in_next_block,
                             const BlockHandle& block_handle) = 0;
  // Called whenever a key is added. The default does nothing; subclasses may
  // override it to collect additional information.
  virtual void OnKeyAdded(const Slice& /*key*/) {}
  // Informs the index builder that all entries have been written, so that it
  // can perform whatever block finalization it needs.
  //
  // REQUIRES: Finish() has not yet been called.
  Status Finish(IndexBlocks* index_blocks) {
    // Placeholder handle: the partition handle has no effect on the first
    // call to Finish, so a default-constructed one is passed and discarded.
    BlockHandle unused_handle;
    return Finish(index_blocks, unused_handle);
  }
  // Overload of Finish that can be used to build the 2nd level index in
  // PartitionIndexBuilder.
  //
  // `index_blocks` is filled with the resulting index data. A return value of
  // Status::Incomplete() means the index is partitioned and Finish should be
  // called again, repeatedly, until Status::OK() is returned. On those
  // follow-up calls `last_partition_block_handle` refers to the block that
  // was written with the result of the previous call; it is used to build the
  // second level index pointing to each partition block. The final call, the
  // one returning Status::OK(), fills `index_blocks` with the 2nd level index
  // content.
  virtual Status Finish(IndexBlocks* index_blocks,
                        const BlockHandle& last_partition_block_handle) = 0;
  // Estimated size of the index block.
  virtual size_t EstimatedSize() const = 0;
 protected:
  const InternalKeyComparator* comparator_;
 };
 // This index builder builds space-efficient index block.
 //
 // Optimizations:
 //  1. Made block's `block_restart_interval` to be 1, which will avoid linear
 //     search when doing index lookup (can be disabled by setting
 //     index_block_restart_interval).
 //  2. Shorten the key length for index block. Other than honestly using the
 //     last key in the data block as the index key, we instead find a shortest
 //     substitute key that serves the same function.
 class ShortenedIndexBuilder : public IndexBuilder {
 public:
  explicit ShortenedIndexBuilder(const InternalKeyComparator* comparator,
                                 int index_block_restart_interval)
      : IndexBuilder(comparator),
        index_block_builder_(index_block_restart_interval) {}
  virtual void AddIndexEntry(std::string* last_key_in_current_block,
                             const Slice* first_key_in_next_block,
                             const BlockHandle& block_handle) override {
    if (first_key_in_next_block != nullptr) {
      comparator_->FindShortestSeparator(last_key_in_current_block,
                                         *first_key_in_next_block);
    } else {
      comparator_->FindShortSuccessor(last_key_in_current_block);
    }
    std::string handle_encoding;
    block_handle.EncodeTo(&handle_encoding);
    index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
  }
  virtual Status Finish(
      IndexBlocks* index_blocks,
      const BlockHandle& last_partition_block_handle) override {
    index_blocks->index_block_contents = index_block_builder_.Finish();
    return Status::OK();
  }
  virtual size_t EstimatedSize() const override {
    return index_block_builder_.CurrentSizeEstimate();
  }
 private:
  BlockBuilder index_block_builder_;
 };
 // HashIndexBuilder contains a binary-searchable primary index and the
 // metadata for secondary hash index construction.
 // The metadata for hash index consists of two parts:
 //  - a metablock that compactly contains a sequence of prefixes. All prefixes
 //    are stored consecutively without any metadata (like, prefix sizes) being
 //    stored, which is kept in the other metablock.
 //  - a metablock contains the metadata of the prefixes, including prefix size,
 //    restart index and number of block it spans. The format looks like:
 //
 // +-----------------+---------------------------+---------------------+
 // <=prefix 1
 // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
 // +-----------------+---------------------------+---------------------+
 // <=prefix 2
 // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
 // +-----------------+---------------------------+---------------------+
 // |                                                                   |
 // | ....                                                              |
 // |                                                                   |
 // +-----------------+---------------------------+---------------------+
 // <=prefix n
 // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
 // +-----------------+---------------------------+---------------------+
 //
 // The reason for separating these two metablocks is to enable efficient
 // reuse of the first metablock during hash index construction without
 // unnecessary data copy or small heap allocations for prefixes.
 class HashIndexBuilder : public IndexBuilder {
 public:
  // `hash_key_extractor` maps each added key to the prefix that is recorded
  // in the hash index metadata. The primary index is delegated to an embedded
  // ShortenedIndexBuilder.
  explicit HashIndexBuilder(const InternalKeyComparator* comparator,
                            const SliceTransform* hash_key_extractor,
                            int index_block_restart_interval)
      : IndexBuilder(comparator),
        primary_index_builder_(comparator, index_block_restart_interval),
        hash_key_extractor_(hash_key_extractor) {}
  // Counts the entry and forwards it to the primary index builder.
  virtual void AddIndexEntry(std::string* last_key_in_current_block,
                             const Slice* first_key_in_next_block,
                             const BlockHandle& block_handle) override {
    ++current_restart_index_;
    primary_index_builder_.AddIndexEntry(last_key_in_current_block,
                                         first_key_in_next_block, block_handle);
  }
  // Tracks the prefix of every added key. A prefix that differs from the
  // pending one flushes the pending prefix and starts a new one; a repeated
  // prefix only bumps the block count when the key lands in a later block.
  virtual void OnKeyAdded(const Slice& key) override {
    auto key_prefix = hash_key_extractor_->Transform(key);
    bool is_first_entry = pending_block_num_ == 0;
    // Keys may share the prefix
    if (is_first_entry || pending_entry_prefix_ != key_prefix) {
      if (!is_first_entry) {
        FlushPendingPrefix();
      }
      // need a hard copy otherwise the underlying data changes all the time.
      // TODO(kailiu) ToString() is expensive. We may speed this up by avoiding
      // the data copy.
      pending_entry_prefix_ = key_prefix.ToString();
      pending_block_num_ = 1;
      pending_entry_index_ = static_cast<uint32_t>(current_restart_index_);
    } else {
      // entry number increments when keys share the prefix reside in
      // different data blocks.
      auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1;
      assert(last_restart_index <= current_restart_index_);
      if (last_restart_index != current_restart_index_) {
        ++pending_block_num_;
      }
    }
  }
  // Flushes the last pending prefix, finishes the primary index and publishes
  // the two hash-index metablocks. Returns the status reported by the primary
  // index builder.
  virtual Status Finish(
      IndexBlocks* index_blocks,
      const BlockHandle& last_partition_block_handle) override {
    // pending_block_num_ is zero only when no key was ever added; flushing in
    // that state would record a bogus zero-length prefix entry.
    if (pending_block_num_ != 0) {
      FlushPendingPrefix();
    }
    // Keep the primary builder's status instead of discarding it.
    Status s = primary_index_builder_.Finish(index_blocks,
                                             last_partition_block_handle);
    index_blocks->meta_blocks.insert(
        {kHashIndexPrefixesBlock.c_str(), prefix_block_});
    index_blocks->meta_blocks.insert(
        {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_});
    return s;
  }
  // Primary index estimate plus the bytes accumulated in both metablocks.
  virtual size_t EstimatedSize() const override {
    return primary_index_builder_.EstimatedSize() + prefix_block_.size() +
           prefix_meta_block_.size();
  }
 private:
  // Appends the pending prefix bytes to prefix_block_ and its
  // (size, entry index, block count) triple to prefix_meta_block_ via
  // PutVarint32Varint32Varint32.
  void FlushPendingPrefix() {
    prefix_block_.append(pending_entry_prefix_.data(),
                         pending_entry_prefix_.size());
    PutVarint32Varint32Varint32(
        &prefix_meta_block_,
        static_cast<uint32_t>(pending_entry_prefix_.size()),
        pending_entry_index_, pending_block_num_);
  }
  ShortenedIndexBuilder primary_index_builder_;
  const SliceTransform* hash_key_extractor_;
  // stores a sequence of prefixes
  std::string prefix_block_;
  // stores the metadata of prefixes
  std::string prefix_meta_block_;
  // The following 3 variables keep the unflushed prefix and its metadata.
  // The details of block_num and entry_index can be found in
  // "block_hash_index.{h,cc}"
  uint32_t pending_block_num_ = 0;
  uint32_t pending_entry_index_ = 0;
  std::string pending_entry_prefix_;
  // Number of index entries added so far (incremented in AddIndexEntry).
  uint64_t current_restart_index_ = 0;
 };
 }  // namespace rocksdb
--- a/table/partitioned_filter_block.cc
+++ b/table/partitioned_filter_block.cc
@ -0,0 +1,84 @@
 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
 //  This source code is licensed under the BSD-style license found in the
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 #include "table/partitioned_filter_block.h"
 #include "port/port.h"
 #include "util/coding.h"
 namespace rocksdb {
 // Prepares the top-level index block builder and creates the builder for
 // the first partition. The sub-builder is created from the constructor
 // arguments, so it does not depend on members initialized after it.
 PartitionIndexBuilder::PartitionIndexBuilder(
    const InternalKeyComparator* comparator,
    const SliceTransform* prefix_extractor, const uint64_t index_per_partition,
    int index_block_restart_interval)
    : IndexBuilder(comparator),
      prefix_extractor_(prefix_extractor),
      index_block_builder_(index_block_restart_interval),
      sub_index_builder_(CreateIndexBuilder(
          sub_type_, comparator, prefix_extractor,
          index_block_restart_interval, index_per_partition)),
      index_per_partition_(index_per_partition),
      index_block_restart_interval_(index_block_restart_interval) {}
 // Frees the partition builder that is still active, if any (deleting a null
 // pointer is a no-op).
 PartitionIndexBuilder::~PartitionIndexBuilder() {
  delete sub_index_builder_;
 }
 // Forwards the entry to the active partition builder, then closes the
 // partition when this was the last entry of the table or when the running
 // entry count reaches a multiple of index_per_partition_. A closed partition
 // is queued in entries_ under the (possibly shortened) last key; unless the
 // table is complete, a fresh partition builder is started.
 void PartitionIndexBuilder::AddIndexEntry(
    std::string* last_key_in_current_block,
    const Slice* first_key_in_next_block, const BlockHandle& block_handle) {
  sub_index_builder_->AddIndexEntry(last_key_in_current_block,
                                    first_key_in_next_block, block_handle);
  num_indexes++;
  if (UNLIKELY(first_key_in_next_block == nullptr)) {  // no more keys
    // entries_ takes ownership of the finished partition builder.
    entries_.push_back({std::string(*last_key_in_current_block),
                        std::unique_ptr<IndexBuilder>(sub_index_builder_)});
    sub_index_builder_ = nullptr;
  } else if (index_per_partition_ != 0 &&
             num_indexes % index_per_partition_ == 0) {
    // The zero check guards the modulo against a division by zero; a limit of
    // zero therefore never cuts a partition before the last key.
    entries_.push_back({std::string(*last_key_in_current_block),
                        std::unique_ptr<IndexBuilder>(sub_index_builder_)});
    sub_index_builder_ =
        CreateIndexBuilder(sub_type_, comparator_, prefix_extractor_,
                           index_block_restart_interval_, index_per_partition_);
  }
 }
 // Emits one block per call: first every queued partition index in order
 // (returning Status::Incomplete() after each), and finally the top-level
 // index (returning Status::OK()). On every call after the first,
 // `last_partition_block_handle` is recorded in the top-level index under the
 // key of the partition that the previous call produced.
 Status PartitionIndexBuilder::Finish(
    IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) {
  assert(!entries_.empty());
  // It must be set to null after last key is added
  assert(sub_index_builder_ == nullptr);
  if (finishing) {
    // The front entry was finished by the previous call; index it and drop it.
    std::string encoded_handle;
    last_partition_block_handle.EncodeTo(&encoded_handle);
    index_block_builder_.Add(entries_.front().key, encoded_handle);
    entries_.pop_front();
  }
  // If there is no sub_index left, then return the 2nd level index.
  if (UNLIKELY(entries_.empty())) {
    index_blocks->index_block_contents = index_block_builder_.Finish();
    return Status::OK();
  }
  // Finish the next partition index in line; Incomplete() tells the caller
  // that more calls to Finish are expected.
  auto partition_status = entries_.front().value->Finish(index_blocks);
  finishing = true;
  if (!partition_status.ok()) {
    return partition_status;
  }
  return Status::Incomplete();
 }
 // Sum of the estimates of the top-level index block, every queued partition
 // builder, and the active partition builder (if there is one).
 size_t PartitionIndexBuilder::EstimatedSize() const {
  size_t estimate = index_block_builder_.CurrentSizeEstimate();
  for (const auto& queued : entries_) {
    estimate += queued.value->EstimatedSize();
  }
  if (sub_index_builder_ != nullptr) {
    estimate += sub_index_builder_->EstimatedSize();
  }
  return estimate;
 }
 }  // namespace rocksdb
--- a/table/partitioned_filter_block.h
+++ b/table/partitioned_filter_block.h
@ -0,0 +1,65 @@
 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
 //  This source code is licensed under the BSD-style license found in the
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 #pragma once
 #include <list>
 #include <string>
 #include "rocksdb/options.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
 #include "util/hash.h"
 #include "table/index_builder.h"
 namespace rocksdb {
 /**
 * IndexBuilder for two-level indexing. Internally it creates a new index for
 * each partition and Finish then in order when Finish is called on it
 * continiously until Status::OK() is returned.
 *
 * The format on the disk would be I I I I I I IP where I is block containing a
 * partition of indexes built using ShortenedIndexBuilder and IP is a block
 * containing a secondary index on the partitions, built using
 * ShortenedIndexBuilder.
 */
 class PartitionIndexBuilder : public IndexBuilder {
 public:
  explicit PartitionIndexBuilder(const InternalKeyComparator* comparator,
                                 const SliceTransform* prefix_extractor,
                                 const uint64_t index_per_partition,
                                 int index_block_restart_interval);
  virtual ~PartitionIndexBuilder();
  virtual void AddIndexEntry(std::string* last_key_in_current_block,
                             const Slice* first_key_in_next_block,
                             const BlockHandle& block_handle);
  virtual Status Finish(IndexBlocks* index_blocks,
                        const BlockHandle& last_partition_block_handle);
  virtual size_t EstimatedSize() const;
 private:
  static const BlockBasedTableOptions::IndexType sub_type_ =
      BlockBasedTableOptions::kBinarySearch;
  struct Entry {
    std::string key;
    std::unique_ptr<IndexBuilder> value;
  };
  std::list<Entry> entries_;  // list of partitioned indexes and their keys
  const SliceTransform* prefix_extractor_;
  BlockBuilder index_block_builder_;  // top-level index builder
  IndexBuilder* sub_index_builder_;   // the active partition index builder
  uint64_t index_per_partition_;
  int index_block_restart_interval_;
  uint64_t num_indexes = 0;
  bool finishing =
      false;  // true if Finish is called once but not complete yet.
 };
 }  // namespace rocksdb
--- a/table/persistent_cache_helper.cc
+++ b/table/persistent_cache_helper.cc
@ -2,7 +2,9 @@
 //  This source code is licensed under the BSD-style license found in the
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 #include "table/persistent_cache_helper.h"
 #include "table/block_based_table_reader.h"
 #include "table/format.h"
 namespace rocksdb {
--- a/table/persistent_cache_helper.h
+++ b/table/persistent_cache_helper.h
@ -6,33 +6,14 @@
 #include <string>
-#include "table/block_based_table_reader.h"
+#include "table/format.h"
 #include "table/persistent_cache_options.h"
 #include "util/statistics.h"
 namespace rocksdb {
 struct BlockContents;
 // PersistentCacheOptions
 //
 // This describes the caching behavior for page cache.
 // It is used to pass the context for caching and the cache handle.
 struct PersistentCacheOptions {
  PersistentCacheOptions() {}
  // `_key_prefix` is taken by const reference: it is copied exactly once,
  // into the `key_prefix` member.
  explicit PersistentCacheOptions(
      const std::shared_ptr<PersistentCache>& _persistent_cache,
      const std::string& _key_prefix, Statistics* const _statistics)
      : persistent_cache(_persistent_cache),
        key_prefix(_key_prefix),
        statistics(_statistics) {}
  virtual ~PersistentCacheOptions() {}
  std::shared_ptr<PersistentCache> persistent_cache;
  std::string key_prefix;
  Statistics* statistics = nullptr;
 };
 // PersistentCacheHelper
 //
 // Encapsulates  some of the helper logic for read and writing from the cache
--- a/table/persistent_cache_options.h
+++ b/table/persistent_cache_options.h
@ -0,0 +1,34 @@
 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
 //  This source code is licensed under the BSD-style license found in the
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 #pragma once
 #include <string>
 #include "include/rocksdb/persistent_cache.h"
 #include "util/statistics.h"
 namespace rocksdb {
 // PersistentCacheOptions
 //
 // This describes the caching behavior for page cache.
 // It is used to pass the context for caching and the cache handle.
 struct PersistentCacheOptions {
  PersistentCacheOptions() {}
  // `_key_prefix` is taken by const reference: it is copied exactly once,
  // into the `key_prefix` member.
  explicit PersistentCacheOptions(
      const std::shared_ptr<PersistentCache>& _persistent_cache,
      const std::string& _key_prefix, Statistics* const _statistics)
      : persistent_cache(_persistent_cache),
        key_prefix(_key_prefix),
        statistics(_statistics) {}
  virtual ~PersistentCacheOptions() {}
  std::shared_ptr<PersistentCache> persistent_cache;
  std::string key_prefix;
  Statistics* statistics = nullptr;
 };
 }  // namespace rocksdb