From 37b459f0aaaaae238d599d1e84f9003570683beb Mon Sep 17 00:00:00 2001
From: Haobo Xu
Date: Wed, 20 Nov 2013 14:30:36 -0800
Subject: [PATCH 01/70] [RocksDB] Test diff on performance branch

Summary: trivial comment change

Test Plan: Go through the steps of developing under the performance branch

Reviewers: dhruba, kailiu, sdong

CC: leveldb

Differential Revision: https://reviews.facebook.net/D14259
---
 db/table_cache.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/db/table_cache.h b/db/table_cache.h
index 4b225af9b..f65326bad 100644
--- a/db/table_cache.h
+++ b/db/table_cache.h
@@ -55,7 +55,7 @@ class TableCache {
              void (*mark_key_may_exist)(void*) = nullptr);

   // Determine whether the table may contain the specified prefix.  If
-  // the table index of blooms are not in memory, this may cause an I/O
+  // the table index or blooms are not in memory, this may cause an I/O
   bool PrefixMayMatch(const ReadOptions& options, uint64_t file_number,
                       uint64_t file_size, const Slice& internal_prefix,
                       bool* table_io);

From 15b31b57dff86f4374bc611ccf6be2c14e15a13b Mon Sep 17 00:00:00 2001
From: Siying Dong
Date: Mon, 18 Nov 2013 15:39:42 -0800
Subject: [PATCH 02/70] MergingIterator.Seek() to lazily initialize MinHeap

Summary:
For the use cases where prefix filtering is enabled, initializing the heaps
in MergingIterator.Seek() might introduce non-negligible costs. This patch
makes the initialization lazy.

Test Plan: make all check

Reviewers: haobo, dhruba, kailiu

CC:

Task ID: #

Blame Rev:
---
 table/merger.cc | 48 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 42 insertions(+), 6 deletions(-)

diff --git a/table/merger.cc b/table/merger.cc
index f5ce7440c..89faf402b 100644
--- a/table/merger.cc
+++ b/table/merger.cc
@@ -26,6 +26,7 @@ class MergingIterator : public Iterator {
       : comparator_(comparator),
         children_(n),
         current_(nullptr),
+        use_heap_(true),
         direction_(kForward),
         maxHeap_(NewMaxIterHeap(comparator_)),
         minHeap_ (NewMinIterHeap(comparator_)) {
@@ -70,14 +71,38 @@ class MergingIterator : public Iterator {
   }

   virtual void Seek(const Slice& target) {
-    ClearHeaps();
+    // Invalidate the heap.
+    use_heap_ = false;
+    IteratorWrapper* first_child = nullptr;
     for (auto& child : children_) {
       child.Seek(target);
       if (child.Valid()) {
-        minHeap_.push(&child);
+        // This child has a valid key.
+        if (!use_heap_) {
+          if (first_child == nullptr) {
+            // It's the first child with a valid key. Only put it into
+            // current_. Now the values in the heap should be invalid.
+            first_child = &child;
+          } else {
+            // We have more than one child with valid keys. Initialize
+            // the heap and put the first child into the heap.
+            ClearHeaps();
+            minHeap_.push(first_child);
+          }
+        }
+        if (use_heap_) {
+          minHeap_.push(&child);
+        }
       }
     }
-    FindSmallest();
+    if (use_heap_) {
+      // If the heap is valid, we need to put the smallest key into current_.
+      FindSmallest();
+    } else {
+      // The heap is not valid, so the current_ iterator is the first
+      // one, or null if there is no first child.
+      current_ = first_child;
+    }
     direction_ = kForward;
   }

@@ -109,10 +134,14 @@
     // as the current points to the current record. move the iterator forward.
     // and if it is valid add it to the heap.
     current_->Next();
-    if (current_->Valid()){
-      minHeap_.push(current_);
+    if (use_heap_) {
+      if (current_->Valid()) {
+        minHeap_.push(current_);
+      }
+      FindSmallest();
+    } else if (!current_->Valid()) {
+      current_ = nullptr;
     }
-    FindSmallest();
   }

   virtual void Prev() {
@@ -178,6 +207,10 @@
   const Comparator* comparator_;
   std::vector<IteratorWrapper> children_;
   IteratorWrapper* current_;
+  // If the value is true, both the iterators in the heap and current_
+  // contain valid entries. If it is false, only current_ can possibly
+  // contain valid entries.
+  bool use_heap_;
   // Which direction is the iterator moving?
   enum Direction {
     kForward,
@@ -189,6 +222,7 @@
 };

 void MergingIterator::FindSmallest() {
+  assert(use_heap_);
   if (minHeap_.empty()) {
     current_ = nullptr;
   } else {
@@ -199,6 +233,7 @@ void MergingIterator::FindSmallest() {
 }

 void MergingIterator::FindLargest() {
+  assert(use_heap_);
   if (maxHeap_.empty()) {
     current_ = nullptr;
   } else {
@@ -209,6 +244,7 @@ void MergingIterator::FindLargest() {
 }

 void MergingIterator::ClearHeaps() {
+  use_heap_ = true;
   maxHeap_ = NewMaxIterHeap(comparator_);
   minHeap_ = NewMinIterHeap(comparator_);
 }

From 071fb0d77b59c4f8cb05e363a8bfd4973db5a9bc Mon Sep 17 00:00:00 2001
From: Siying Dong
Date: Tue, 19 Nov 2013 16:37:34 -0800
Subject: [PATCH 03/70] Inline a couple of functions and make one saved
 value's clearing lazy

Summary:
Make several functions inline. Also, in DBIter.Seek(), make the clearing of
the saved value lazy. These are for the use case where Seek() is called many
times but few calls return values.

Test Plan: make all check

Differential Revision: https://reviews.facebook.net/D14217
---
 db/db_iter.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/db/db_iter.cc b/db/db_iter.cc
index 4e3c52c6e..7a395db7a 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -426,13 +426,13 @@ void DBIter::FindPrevUserEntry() {
 }

 void DBIter::Seek(const Slice& target) {
-  direction_ = kForward;
-  ClearSavedValue();
   saved_key_.clear();
   AppendInternalKey(
       &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek));
   iter_->Seek(saved_key_);
   if (iter_->Valid()) {
+    direction_ = kForward;
+    ClearSavedValue();
     FindNextUserEntry(false /*not skipping */);
   } else {
     valid_ = false;

From b59d4d5a5051263b4bfcef00913219ffe4654e42 Mon Sep 17 00:00:00 2001
From: Siying Dong
Date: Mon, 28 Oct 2013 20:34:02 -0700
Subject: [PATCH 04/70] A Simple Plain Table

Summary:
A simple plain table format with no block structure. When creating the table
reader, the full table is scanned to build the indexes.
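As an editor's sketch (not part of the patch), a record in the format
described above can be decoded like this. The fixed internal-key length and
the 4-byte value size follow the constants used by PlainTableBuilder::Add()
and PlainTableReader::Next() below; DecodeRecord is a hypothetical helper:

    #include <cstdint>
    #include <cstring>
    #include <string>

    // One record: | internal key (user_key_size + 8) | value_size (4) | value |
    // Assumes a little-endian host, matching PutFixed32/DecodeFixed32.
    static const char* DecodeRecord(const char* pos, size_t internal_key_size,
                                    std::string* key, std::string* value) {
      key->assign(pos, internal_key_size);           // fixed-length internal key
      pos += internal_key_size;
      uint32_t value_size;
      std::memcpy(&value_size, pos, sizeof(value_size));  // fixed32 size field
      pos += sizeof(value_size);
      value->assign(pos, value_size);                // value bytes follow directly
      return pos + value_size;                       // start of the next record
    }

Scanning a table is then just repeated calls to such a decoder from offset 0
up to the index block offset, which mirrors how the reader builds its prefix
hash index in PopulateIndex().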
Test Plan: Add unit test

Reviewers: haobo, dhruba, kailiu

CC:

Task ID: #

Blame Rev:
---
 Makefile                              |   7 +-
 db/plain_table_db_test.cc             | 332 ++++++++++++++++++++++++
 include/rocksdb/plain_table_factory.h |  69 +++++
 table/plain_table_builder.cc          |  77 ++++++
 table/plain_table_builder.h           |  91 +++++++
 table/plain_table_factory.cc          |  31 +++
 table/plain_table_reader.cc           | 358 ++++++++++++++++++++++++++
 table/plain_table_reader.h            | 168 ++++++++++++
 table/table_reader_bench.cc           |  14 +-
 util/env_posix.cc                     |   4 +-
 10 files changed, 1148 insertions(+), 3 deletions(-)
 create mode 100644 db/plain_table_db_test.cc
 create mode 100644 include/rocksdb/plain_table_factory.h
 create mode 100644 table/plain_table_builder.cc
 create mode 100644 table/plain_table_builder.h
 create mode 100644 table/plain_table_factory.cc
 create mode 100644 table/plain_table_reader.cc
 create mode 100644 table/plain_table_reader.h

diff --git a/Makefile b/Makefile
index be7758de9..6fa2864eb 100644
--- a/Makefile
+++ b/Makefile
@@ -72,6 +72,7 @@ TESTS = \
 	merge_test \
 	redis_test \
 	reduce_levels_test \
+	plain_table_db_test \
 	simple_table_db_test \
 	skiplist_test \
 	stringappend_test \
@@ -90,6 +91,7 @@ TOOLS = \
 	db_repl_stress \
 	blob_store_bench

+
 PROGRAMS = db_bench signal_test $(TESTS) $(TOOLS)
 BENCHMARKS = db_bench_sqlite3 db_bench_tree_db table_reader_bench

@@ -260,11 +262,14 @@ crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
 db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

+plain_table_db_test: db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
 simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

 table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+	$(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg

 perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)

diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc
new file mode 100644
index 000000000..3697b4c45
--- /dev/null
+++ b/db/plain_table_db_test.cc
@@ -0,0 +1,332 @@
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/filter_policy.h" +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "db/db_statistics.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/env.h" +#include "rocksdb/table.h" +#include "rocksdb/plain_table_factory.h" +#include "util/hash.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "utilities/merge_operators.h" + +using std::unique_ptr; + +namespace rocksdb { + +class PlainTableDBTest { +protected: +public: + std::string dbname_; + Env* env_; + DB* db_; + + Options last_options_; + + PlainTableDBTest() : + env_(Env::Default()) { + dbname_ = test::TmpDir() + "/plain_table_db_test"; + ASSERT_OK(DestroyDB(dbname_, Options())); + db_ = nullptr; + Reopen(); + } + + ~PlainTableDBTest() { + delete db_; + ASSERT_OK(DestroyDB(dbname_, Options())); + } + + // Return the current option configuration. + Options CurrentOptions() { + Options options; + options.table_factory.reset(new PlainTableFactory(16, 8)); + options.allow_mmap_reads = true; + return options; + } + + DBImpl* dbfull() { + return reinterpret_cast(db_); + } + + void Reopen(Options* options = nullptr) { + ASSERT_OK(TryReopen(options)); + } + + void Close() { + delete db_; + db_ = nullptr; + } + + void DestroyAndReopen(Options* options = nullptr) { + //Destroy using last options + Destroy(&last_options_); + ASSERT_OK(TryReopen(options)); + } + + void Destroy(Options* options) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, *options)); + } + + Status PureReopen(Options* options, DB** db) { + return DB::Open(*options, dbname_, db); + } + + Status TryReopen(Options* options = nullptr) { + delete db_; + db_ = nullptr; + Options opts; + if (options != nullptr) { + opts = *options; + } else { + opts = CurrentOptions(); + opts.create_if_missing = true; + } + last_options_ = opts; + + return DB::Open(opts, dbname_, &db_); + } + + Status Put(const Slice& k, const Slice& v) { + return db_->Put(WriteOptions(), k, v); + } + + Status Delete(const std::string& k) { + return db_->Delete(WriteOptions(), k); + } + + std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { + ReadOptions options; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + + int NumTableFilesAtLevel(int level) { + std::string property; + ASSERT_TRUE( + db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level), + &property)); + return atoi(property.c_str()); + } + + // Return spread of files per level + std::string FilesPerLevel() { + std::string result; + int last_non_zero_offset = 0; + for (int level = 0; level < db_->NumberLevels(); level++) { + int f = NumTableFilesAtLevel(level); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? 
"," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; + } + + std::string IterStatus(Iterator* iter) { + std::string result; + if (iter->Valid()) { + result = iter->key().ToString() + "->" + iter->value().ToString(); + } else { + result = "(invalid)"; + } + return result; + } +}; + +TEST(PlainTableDBTest, Empty) { + ASSERT_TRUE(db_ != nullptr); + ASSERT_EQ("NOT_FOUND", Get("0000000000000foo")); +} + +TEST(PlainTableDBTest, ReadWrite) { + ASSERT_OK(Put("1000000000000foo", "v1")); + ASSERT_EQ("v1", Get("1000000000000foo")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("1000000000000foo", "v3")); + ASSERT_EQ("v3", Get("1000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); +} + +TEST(PlainTableDBTest, Flush) { + ASSERT_OK(Put("1000000000000foo", "v1")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("1000000000000foo", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("1000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); +} + +TEST(PlainTableDBTest, Iterator) { + ASSERT_OK(Put("1000000000foo002", "v_2")); + ASSERT_OK(Put("0000000000000bar", "random")); + ASSERT_OK(Put("1000000000foo001", "v1")); + ASSERT_OK(Put("3000000000000bar", "bar_v")); + ASSERT_OK(Put("1000000000foo003", "v__3")); + ASSERT_OK(Put("1000000000foo004", "v__4")); + ASSERT_OK(Put("1000000000foo005", "v__5")); + ASSERT_OK(Put("1000000000foo007", "v__7")); + ASSERT_OK(Put("1000000000foo008", "v__8")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v1", Get("1000000000foo001")); + ASSERT_EQ("v__3", Get("1000000000foo003")); + ReadOptions ro; + Iterator* iter = dbfull()->NewIterator(ro); + iter->Seek("1000000000foo001"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo001", iter->key().ToString()); + ASSERT_EQ("v1", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo002", iter->key().ToString()); + ASSERT_EQ("v_2", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo003", iter->key().ToString()); + ASSERT_EQ("v__3", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo004", iter->key().ToString()); + ASSERT_EQ("v__4", iter->value().ToString()); + + iter->Seek("3000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + ASSERT_EQ("bar_v", iter->value().ToString()); + + iter->Seek("1000000000foo000"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo001", iter->key().ToString()); + ASSERT_EQ("v1", iter->value().ToString()); + + iter->Seek("1000000000foo005"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo005", iter->key().ToString()); + ASSERT_EQ("v__5", iter->value().ToString()); + + iter->Seek("1000000000foo006"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo007", iter->key().ToString()); + ASSERT_EQ("v__7", iter->value().ToString()); + + iter->Seek("1000000000foo008"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo008", iter->key().ToString()); + ASSERT_EQ("v__8", iter->value().ToString()); + + iter->Seek("1000000000foo009"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + + + delete iter; +} + +TEST(PlainTableDBTest, Flush2) { + ASSERT_OK(Put("0000000000000bar", "b")); + ASSERT_OK(Put("1000000000000foo", "v1")); + dbfull()->TEST_FlushMemTable(); + + ASSERT_OK(Put("1000000000000foo", "v2")); 
+ dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get("1000000000000foo")); + + ASSERT_OK(Put("0000000000000eee", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("0000000000000eee")); + + ASSERT_OK(Delete("0000000000000bar")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); + + ASSERT_OK(Put("0000000000000eee", "v5")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v5", Get("0000000000000eee")); +} + +static std::string Key(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "key_______%06d", i); + return std::string(buf); +} + +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} + +TEST(PlainTableDBTest, CompactionTrigger) { + Options options = CurrentOptions(); + options.write_buffer_size = 100 << 10; //100KB + options.num_levels = 3; + options.max_mem_compaction_level = 0; + options.level0_file_num_compaction_trigger = 3; + Reopen(&options); + + Random rnd(301); + + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; + num++) { + std::vector values; + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + } + + //generate one more file in level-0, and should trigger level-0 compaction + std::vector values; + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/include/rocksdb/plain_table_factory.h b/include/rocksdb/plain_table_factory.h new file mode 100644 index 000000000..f8a0cb9a9 --- /dev/null +++ b/include/rocksdb/plain_table_factory.h @@ -0,0 +1,69 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include + +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +namespace rocksdb { + +struct Options; +struct EnvOptions; + +using std::unique_ptr; +class Status; +class RandomAccessFile; +class WritableFile; +class Table; +class TableBuilder; + +// IndexedTable requires fixed length key, configured as a constructor +// parameter of the factory class. Output file format: +// +--------------------------------------------+ <= key1 offset +// | key1 | value_size (4 bytes) | | +// +----------------------------------------+ | +// | value1 | +// | | +// +----------------------------------------+---+ <= key2 offset +// | key2 | value_size (4 bytes) | | +// +----------------------------------------+ | +// | value2 | +// | | +// | ...... | +// +-----------------+--------------------------+ <= index_block_offset +// | key1 | key1 offset (8 bytes) | +// +-----------------+--------------------------+ +// | key2 | key2 offset (8 bytes) | +// +-----------------+--------------------------+ +// | key3 | key3 offset (8 bytes) | +// +-----------------+--------------------------+ +// | ...... 
| +// +-----------------+------------+-------------+ +class PlainTableFactory: public TableFactory { +public: + ~PlainTableFactory() { + } + PlainTableFactory(int user_key_size, int key_prefix_len) : + user_key_size_(user_key_size), key_prefix_len_(key_prefix_len) { + } + const char* Name() const override { + return "PlainTable"; + } + Status GetTableReader(const Options& options, const EnvOptions& soptions, + unique_ptr && file, + uint64_t file_size, + unique_ptr* table) const override; + + TableBuilder* GetTableBuilder(const Options& options, WritableFile* file, + CompressionType compression_type) const + override; +private: + int user_key_size_; + int key_prefix_len_; +}; + +} // namespace rocksdb diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc new file mode 100644 index 000000000..ed0b4d988 --- /dev/null +++ b/table/plain_table_builder.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/plain_table_builder.h" + +#include +#include + +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "table/block_builder.h" +#include "table/filter_block.h" +#include "table/format.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +PlainTableBuilder::PlainTableBuilder(const Options& options, + WritableFile* file, + int user_key_size, int key_prefix_len) : + options_(options), file_(file), user_key_size_(user_key_size), + key_prefix_len_(key_prefix_len) { +} + +PlainTableBuilder::~PlainTableBuilder() { +} + +Status PlainTableBuilder::ChangeOptions(const Options& options) { + return Status::OK(); +} + +void PlainTableBuilder::Add(const Slice& key, const Slice& value) { + assert((int) key.size() == GetInternalKeyLength()); + + // Write key-value pair + file_->Append(key); + offset_ += GetInternalKeyLength(); + + std::string size; + int value_size = value.size(); + PutFixed32(&size, value_size); + Slice sizeSlice(size); + file_->Append(sizeSlice); + file_->Append(value); + offset_ += value_size + 4; + + num_entries_++; +} + +Status PlainTableBuilder::status() const { + return Status::OK(); +} + +Status PlainTableBuilder::Finish() { + assert(!closed_); + closed_ = true; + return Status::OK(); +} + +void PlainTableBuilder::Abandon() { + closed_ = true; +} + +uint64_t PlainTableBuilder::NumEntries() const { + return num_entries_; +} + +uint64_t PlainTableBuilder::FileSize() const { + return offset_; +} + +} // namespace rocksdb diff --git a/table/plain_table_builder.h b/table/plain_table_builder.h new file mode 100644 index 000000000..b48552efc --- /dev/null +++ b/table/plain_table_builder.h @@ -0,0 +1,91 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// IndexedTable is a simple table format for UNIT TEST ONLY. It is not built +// as production quality. 
+ +#pragma once +#include +#include "rocksdb/options.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" + +namespace rocksdb { + +class BlockBuilder; +class BlockHandle; +class WritableFile; +class TableBuilder; + +class PlainTableBuilder: public TableBuilder { +public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). The output file + // will be part of level specified by 'level'. A value of -1 means + // that the caller does not know which level the output file will reside. + PlainTableBuilder(const Options& options, WritableFile* file, + int user_key_size, int key_prefix_len); + + // REQUIRES: Either Finish() or Abandon() has been called. + ~PlainTableBuilder(); + + // Change the options used by this builder. Note: only some of the + // option fields can be changed after construction. If a field is + // not allowed to change dynamically and its value in the structure + // passed to the constructor is different from its value in the + // structure passed to this method, this method will return an error + // without changing any fields. + Status ChangeOptions(const Options& options); + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override; + + // Return non-ok iff some error has been detected. + Status status() const override; + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish() override; + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. + uint64_t NumEntries() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const override; + +private: + Options options_; + WritableFile* file_; + uint64_t offset_ = 0; + Status status_; + uint64_t num_entries_ = 0; + + const size_t user_key_size_; + const size_t key_prefix_len_; + bool closed_ = false; // Either Finish() or Abandon() has been called. + + int GetInternalKeyLength() { + return user_key_size_ + 8; + } + + // No copying allowed + PlainTableBuilder(const PlainTableBuilder&) = delete; + void operator=(const PlainTableBuilder&) = delete; +}; + +} // namespace rocksdb + diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc new file mode 100644 index 000000000..10393501d --- /dev/null +++ b/table/plain_table_factory.cc @@ -0,0 +1,31 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+
+#include "rocksdb/plain_table_factory.h"
+
+#include
+#include
+#include "table/plain_table_builder.h"
+#include "table/plain_table_reader.h"
+#include "port/port.h"
+
+namespace rocksdb {
+
+Status PlainTableFactory::GetTableReader(const Options& options,
+                                         const EnvOptions& soptions,
+                                         unique_ptr<RandomAccessFile>&& file,
+                                         uint64_t file_size,
+                                         unique_ptr<TableReader>* table)
+    const {
+  return PlainTableReader::Open(options, soptions, std::move(file), file_size,
+                                table, user_key_size_, key_prefix_len_);
+}
+
+TableBuilder* PlainTableFactory::GetTableBuilder(
+    const Options& options, WritableFile* file,
+    CompressionType compression_type) const {
+  return new PlainTableBuilder(options, file, user_key_size_,
+                               key_prefix_len_);
+}
+}  // namespace rocksdb
diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc
new file mode 100644
index 000000000..5577c4eca
--- /dev/null
+++ b/table/plain_table_reader.cc
@@ -0,0 +1,358 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/plain_table_reader.h"
+
+#include
+
+#include "db/dbformat.h"
+
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+
+#include "table/block.h"
+#include "table/filter_block.h"
+#include "table/format.h"
+#include "table/two_level_iterator.h"
+
+#include "util/coding.h"
+#include "util/hash.h"
+#include "util/histogram.h"
+#include "util/perf_context_imp.h"
+#include "util/stop_watch.h"
+
+namespace std {
+template<>
+struct hash<rocksdb::Slice> {
+public:
+  std::size_t operator()(rocksdb::Slice const& s) const {
+    return rocksdb::Hash(s.data(), s.size(), 397);
+  }
+};
+}
+
+namespace rocksdb {
+
+PlainTableReader::PlainTableReader(const EnvOptions& storage_options,
+                                   uint64_t file_size, int user_key_size,
+                                   int key_prefix_len) :
+    soptions_(storage_options), file_size_(file_size),
+    user_key_size_(user_key_size), key_prefix_len_(key_prefix_len) {
+  hash_table_ = nullptr;
+}
+
+PlainTableReader::~PlainTableReader() {
+  if (hash_table_ != nullptr) {
+    delete[] hash_table_;
+  }
+}
+
+Status PlainTableReader::Open(const Options& options,
+                              const EnvOptions& soptions,
+                              unique_ptr<RandomAccessFile>&& file,
+                              uint64_t file_size,
+                              unique_ptr<TableReader>* table_reader,
+                              const int user_key_size,
+                              const int key_prefix_len) {
+  assert(options.allow_mmap_reads);
+
+  PlainTableReader* t = new PlainTableReader(soptions, file_size,
+                                             user_key_size,
+                                             key_prefix_len);
+  t->file_ = std::move(file);
+  t->options_ = options;
+  Status s = t->PopulateIndex(file_size);
+  if (!s.ok()) {
+    delete t;
+    return s;
+  }
+  table_reader->reset(t);
+  return s;
+}
+
+void PlainTableReader::SetupForCompaction() {
+}
+
+bool PlainTableReader::PrefixMayMatch(const Slice& internal_prefix) {
+  return true;
+}
+
+Iterator* PlainTableReader::NewIterator(const ReadOptions& options) {
+  return new PlainTableIterator(this);
+}
+
+Status PlainTableReader::PopulateIndex(uint64_t file_size) {
+  Slice key_slice;
+  Slice key_prefix_slice;
+  Slice key_suffix_slice;
+  Slice value_slice;
+  Slice tmp_slice;
+  Slice prev_key_prefix_slice;
+  uint64_t pos = 0;
+  uint64_t data_offset_for_cur_prefix = 0;
+  int count_prefix = 0;
+  bool first = true;
+  std::string prefix_sub_index;
+  HistogramImpl keys_per_prefix_hist;
+  std::unordered_map<Slice, uint64_t> tmp_index;
+
+  while (pos < file_size) {
+    uint64_t key_offset = pos;
+    pos
= Next(pos, &key_slice, &value_slice, &tmp_slice); + key_prefix_slice = Slice(key_slice.data(), key_prefix_len_); + + if (first || prev_key_prefix_slice != key_prefix_slice) { + if (!first) { + if (count_prefix < 8 || key_prefix_len_ == user_key_size_) { + tmp_index[prev_key_prefix_slice] = data_offset_for_cur_prefix; + } else { + tmp_index[prev_key_prefix_slice] = sub_index_.length() + | kSubIndexMask; + PutFixed32(&sub_index_, (count_prefix - 1) / 8 + 1); + sub_index_.append(prefix_sub_index); + } + prefix_sub_index.clear(); + data_offset_for_cur_prefix = key_offset; + keys_per_prefix_hist.Add(count_prefix); + } + prev_key_prefix_slice = key_prefix_slice; + count_prefix = 1; + } else { + count_prefix++; + } + if (key_prefix_len_ < user_key_size_ && count_prefix % 8 == 1) { + prefix_sub_index.append(key_slice.data() + key_prefix_len_, + user_key_size_ - key_prefix_len_); + PutFixed64(&prefix_sub_index, key_offset); + } + + first = false; + } + keys_per_prefix_hist.Add(count_prefix); + if (count_prefix <= 2 || key_prefix_len_ == user_key_size_) { + tmp_index[prev_key_prefix_slice] = data_offset_for_cur_prefix; + } else { + tmp_index[prev_key_prefix_slice] = sub_index_.length() | kSubIndexMask; + PutFixed32(&sub_index_, (count_prefix - 1) / 8 + 1); + sub_index_.append(prefix_sub_index); + } + + if (hash_table_ != nullptr) { + delete[] hash_table_; + } + // Make the hash table 3/5 full + hash_table_size_ = tmp_index.size() * 1.66; + hash_table_ = new char[GetHashTableRecordLen() * hash_table_size_]; + for (int i = 0; i < hash_table_size_; i++) { + memcpy(GetHashTableBucketPtr(i) + key_prefix_len_, &file_size_, + kOffsetLen); + } + + for (auto it = tmp_index.begin(); it != tmp_index.end(); ++it) { + int bucket = GetHashTableBucket(it->first); + uint64_t* hash_value; + while (true) { + GetHashValue(bucket, &hash_value); + if (*hash_value == file_size_) { + break; + } + bucket = (bucket + 1) % hash_table_size_; + } + + char* bucket_ptr = GetHashTableBucketPtr(bucket); + memcpy(bucket_ptr, it->first.data(), key_prefix_len_); + memcpy(bucket_ptr + key_prefix_len_, &it->second, kOffsetLen); + } + + Log(options_.info_log, "Number of prefixes: %d, suffix_map length %ld", + hash_table_size_, sub_index_.length()); + Log(options_.info_log, "Number of Keys per prefix Histogram: %s", + keys_per_prefix_hist.ToString().c_str()); + + return Status::OK(); +} + +inline int PlainTableReader::GetHashTableBucket(Slice key) { + return rocksdb::Hash(key.data(), key_prefix_len_, 397) % hash_table_size_; +} + +inline void PlainTableReader::GetHashValue(int bucket, uint64_t** ret_value) { + *ret_value = (uint64_t*) (GetHashTableBucketPtr(bucket) + key_prefix_len_); +} + +Status PlainTableReader::GetOffset(const Slice& target, uint64_t* offset) { + Status s; + + int bucket = GetHashTableBucket(target); + uint64_t* found_value; + Slice hash_key; + while (true) { + GetHashValue(bucket, &found_value); + if (*found_value == file_size_) { + break; + } + GetHashKey(bucket, &hash_key); + if (target.starts_with(hash_key)) { + break; + } + bucket = (bucket + 1) % hash_table_size_; + } + + if (*found_value == file_size_ || (*found_value & kSubIndexMask) == 0) { + *offset = *found_value; + return Status::OK(); + } + + uint32_t low = 0; + uint64_t prefix_index_offset = *found_value ^ kSubIndexMask; + uint32_t high = DecodeFixed32(sub_index_.data() + prefix_index_offset); + uint64_t base_offset = prefix_index_offset + 4; + char* mid_key_str = new char[target.size()]; + memcpy(mid_key_str, target.data(), target.size()); + Slice 
mid_key = Slice(mid_key_str, target.size()); + + // The key is between (low, high). Do a binary search between it. + while (high - low > 1) { + uint32_t mid = (high + low) / 2; + const char* base = sub_index_.data() + base_offset + + (user_key_size_ - key_prefix_len_ + kOffsetLen) * mid; + memcpy(mid_key_str + key_prefix_len_, base, + user_key_size_ - key_prefix_len_); + + int cmp_result = options_.comparator->Compare(target, mid_key); + if (cmp_result > 0) { + low = mid; + } else { + if (cmp_result == 0) { + // Happen to have found the exact key or target is smaller than the + // first key after base_offset. + *offset = DecodeFixed64(base + user_key_size_ - key_prefix_len_); + delete[] mid_key_str; + return s; + } else { + high = mid; + } + } + } + + const char* base = sub_index_.data() + base_offset + + (user_key_size_ - key_prefix_len_ + kOffsetLen) * low; + *offset = DecodeFixed64(base + user_key_size_ - key_prefix_len_); + + delete[] mid_key_str; + return s; +} + +uint64_t PlainTableReader::Next(uint64_t offset, Slice* key, Slice* value, + Slice* tmp_slice) { + if (offset >= file_size_) { + return file_size_; + } + int internal_key_size = GetInternalKeyLength(); + + Status s = file_->Read(offset, internal_key_size, key, nullptr); + offset += internal_key_size; + + s = file_->Read(offset, 4, tmp_slice, nullptr); + offset += 4; + uint32_t value_size = DecodeFixed32(tmp_slice->data()); + + s = file_->Read(offset, value_size, value, nullptr); + offset += value_size; + + return offset; +} + +Status PlainTableReader::Get( + const ReadOptions& ro, const Slice& target, void* arg, + bool (*saver)(void*, const Slice&, const Slice&, bool), + void (*mark_key_may_exist)(void*)) { + uint64_t offset; + Status s = GetOffset(target, &offset); + if (!s.ok()) { + return s; + } + Slice found_key; + Slice found_value; + Slice tmp_slice; + while (offset < file_size_) { + offset = Next(offset, &found_key, &found_value, &tmp_slice); + if (options_.comparator->Compare(found_key, target) >= 0 + && !(*saver)(arg, found_key, found_value, true)) { + break; + } + } + return s; +} + +bool PlainTableReader::TEST_KeyInCache(const ReadOptions& options, + const Slice& key) { + return false; +} + +uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& key) { + return 0; +} + +PlainTableIterator::PlainTableIterator(PlainTableReader* table) : + table_(table) { + SeekToFirst(); +} + +PlainTableIterator::~PlainTableIterator() { +} + +bool PlainTableIterator::Valid() const { + return offset_ < table_->file_size_ && offset_ >= 0; +} + +void PlainTableIterator::SeekToFirst() { + next_offset_ = 0; + Next(); +} + +void PlainTableIterator::SeekToLast() { + assert(false); +} + +void PlainTableIterator::Seek(const Slice& target) { + Status s = table_->GetOffset(target, &next_offset_); + if (!s.ok()) { + status_ = s; + } + if (next_offset_ < table_->file_size_) { + for (Next(); + Valid() && table_->options_.comparator->Compare(key(), target) < 0; + Next()) { + } + } +} + +void PlainTableIterator::Next() { + offset_ = next_offset_; + Slice tmp_slice; + next_offset_ = table_->Next(next_offset_, &key_, &value_, &tmp_slice); +} + +void PlainTableIterator::Prev() { + assert(false); +} + +Slice PlainTableIterator::key() const { + return key_; +} + +Slice PlainTableIterator::value() const { + return value_; +} + +Status PlainTableIterator::status() const { + return status_; +} + +} // namespace rocksdb diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h new file mode 100644 index 000000000..44b545833 --- /dev/null +++ 
b/table/plain_table_reader.h @@ -0,0 +1,168 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/table.h" + +namespace rocksdb { + +class Block; +class BlockHandle; +class Footer; +struct Options; +class RandomAccessFile; +struct ReadOptions; +class TableCache; +class TableReader; + +using std::unique_ptr; +using std::unordered_map; + +// Based on following output file format: +// +--------------------------------------------+ <= key1_data_offset +// | key1 | value_size (4 bytes) | | +// +----------------------------------------+ | +// | value1 | +// | | +// +----------------------------------------+---+ <= key2_data_offset +// | key2 | value_size (4 bytes) | | +// +----------------------------------------+ | +// | value2 | +// | | +// | ...... | +// +-----------------+--------------------------+ <= index_block_offset +// | key1 | key1 offset (8 bytes) | +// +-----------------+--------------------------+ <= key2_index_offset +// | key2 | key2 offset (8 bytes) | +// +-----------------+--------------------------+ <= key3_index_offset +// | key3 | key3 offset (8 bytes) | +// +-----------------+--------------------------+ <= key4_index_offset +// | ...... | +// +-----------------+------------+-------------+ +// When opening the output file, IndexedTableReader creates a hash table +// from key prefixes to offset of the output file. IndexedTable will decide +// whether it points to the data offset of the first key with the key prefix +// or the offset of it. If there are too many keys share this prefix, it will +// create a binary search-able index from the suffix to offset on disk. 
+//
+// The implementation of IndexedTableReader requires the output file to be
+// mmapped.
+class PlainTableReader: public TableReader {
+public:
+  static Status Open(const Options& options, const EnvOptions& soptions,
+                     unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+                     unique_ptr<TableReader>* table, const int user_key_size,
+                     const int key_prefix_len);
+
+  bool PrefixMayMatch(const Slice& internal_prefix);
+
+  Iterator* NewIterator(const ReadOptions&);
+
+  Status Get(
+      const ReadOptions&, const Slice& key, void* arg,
+      bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool),
+      void (*mark_key_may_exist)(void*) = nullptr);
+
+  uint64_t ApproximateOffsetOf(const Slice& key);
+
+  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);
+
+  void SetupForCompaction();
+
+  TableProperties& GetTableProperties() {
+    return tbl_props;
+  }
+
+  PlainTableReader(const EnvOptions& storage_options, uint64_t file_size,
+                   int user_key_size, int key_prefix_len);
+  ~PlainTableReader();
+
+private:
+  char* hash_table_;
+  int hash_table_size_;
+  std::string sub_index_;
+
+  Options options_;
+  const EnvOptions& soptions_;
+  Status status_;
+  unique_ptr<RandomAccessFile> file_;
+
+  uint64_t file_size_;
+  const size_t user_key_size_;
+  const size_t key_prefix_len_;
+
+  TableProperties tbl_props;
+
+  static const size_t kNumInternalBytes = 8;
+  static const uint64_t kSubIndexMask = 0x8000000000000000;
+  static const size_t kOffsetLen = sizeof(uint64_t);
+
+  inline int GetHashTableBucket(Slice key);
+  inline size_t GetInternalKeyLength() {
+    return user_key_size_ + kNumInternalBytes;
+  }
+  inline size_t GetHashTableRecordLen() {
+    return key_prefix_len_ + kOffsetLen;
+  }
+  inline char* GetHashTableBucketPtr(int bucket) {
+    return hash_table_ + GetHashTableRecordLen() * bucket;
+  }
+  inline void GetHashKey(int bucket, Slice* slice) {
+    *slice = Slice(GetHashTableBucketPtr(bucket), key_prefix_len_);
+  }
+  inline void GetHashValue(int bucket, uint64_t** ret_value);
+
+  friend class TableCache;
+  friend class PlainTableIterator;
+
+  Status PopulateIndex(uint64_t file_size);
+  uint64_t Next(uint64_t offset, Slice* key, Slice* value, Slice* tmp_slice);
+  Status GetOffset(const Slice& target, uint64_t* offset);
+
+  // No copying allowed
+  explicit PlainTableReader(const TableReader&) = delete;
+  void operator=(const TableReader&) = delete;
+};
+
+// Iterator to iterate IndexedTable
+class PlainTableIterator: public Iterator {
+public:
+  explicit PlainTableIterator(PlainTableReader* table);
+  ~PlainTableIterator();
+
+  bool Valid() const;
+
+  void SeekToFirst();
+
+  void SeekToLast();
+
+  void Seek(const Slice& target);
+
+  void Next();
+
+  void Prev();
+
+  Slice key() const;
+
+  Slice value() const;
+
+  Status status() const;
+
+private:
+  PlainTableReader* table_;
+  uint64_t offset_;
+  uint64_t next_offset_;
+  Slice key_;
+  Slice value_;
+  Status status_;
+  // No copying allowed
+  PlainTableIterator(const PlainTableIterator&) = delete;
+  void operator=(const Iterator&) = delete;
+};
+
+}  // namespace rocksdb
diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc
index e7b6b0b7a..8d3fd2412 100644
--- a/table/table_reader_bench.cc
+++ b/table/table_reader_bench.cc
@@ -6,12 +6,14 @@
 #include

 #include "rocksdb/db.h"
+#include "rocksdb/slice_transform.h"
 #include "rocksdb/table.h"
 #include "rocksdb/slice_transform.h"
 #include "db/db_impl.h"
 #include "db/dbformat.h"
 #include "port/atomic_pointer.h"
 #include "table/block_based_table_factory.h"
+#include "rocksdb/plain_table_factory.h"
 #include "util/histogram.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
@@ -218,6 +220,8 @@ DEFINE_bool(iterator, false, "For test iterator");
 DEFINE_bool(through_db, false, "If enable, a DB instance will be created and "
             "the query will be against DB. Otherwise, will be directly against "
             "a table reader.");
+DEFINE_bool(plain_table, false, "Use PlainTable");
+
 int main(int argc, char** argv) {
   google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
@@ -230,10 +234,18 @@ int main(int argc, char** argv) {
     options.prefix_extractor = rocksdb::NewFixedPrefixTransform(
         FLAGS_prefix_len);
   }
-  options.SetUpDefaultFlushBlockPolicyFactory();
   rocksdb::ReadOptions ro;
   rocksdb::EnvOptions env_options;
   options.create_if_missing = true;
+  options.compression = rocksdb::CompressionType::kNoCompression;
+
+  if (FLAGS_plain_table) {
+    options.allow_mmap_reads = true;
+    env_options.use_mmap_reads = true;
+    tf = new rocksdb::PlainTableFactory(16, FLAGS_prefix_len);
+  } else {
+    tf = new rocksdb::BlockBasedTableFactory();
+  }
   options.table_factory = std::shared_ptr<rocksdb::TableFactory>(tf);

   TableReaderBenchmark(options, env_options, ro, FLAGS_num_keys1,
diff --git a/util/env_posix.cc b/util/env_posix.cc
index 356008225..c6995b30c 100644
--- a/util/env_posix.cc
+++ b/util/env_posix.cc
@@ -306,7 +306,9 @@ class PosixMmapReadableFile: public RandomAccessFile {
     assert(options.use_mmap_reads);
     assert(options.use_os_buffer);
   }
-  virtual ~PosixMmapReadableFile() { munmap(mmapped_region_, length_); }
+  virtual ~PosixMmapReadableFile() {
+    assert(munmap(mmapped_region_, length_) == 0);
+  }
   virtual Status Read(uint64_t offset, size_t n, Slice* result,
                       char* scratch) const {

From 7b10fe9fac7a716e80216b7602e67be577a66c77 Mon Sep 17 00:00:00 2001
From: kailiu
Date: Wed, 20 Nov 2013 13:45:32 -0800
Subject: [PATCH 05/70] Fix a memory leak that happened in table_test
---
 table/table_test.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/table/table_test.cc b/table/table_test.cc
index e93e9bcec..394aa4b9d 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -252,11 +252,14 @@ class BlockBasedTableConstructor: public Constructor {
   virtual Status FinishImpl(const Options& options, const KVMap& data) {
     Reset();
     sink_.reset(new StringSink());
+    std::unique_ptr<FlushBlockBySizePolicyFactory> flush_policy_factory(
+        new FlushBlockBySizePolicyFactory(options.block_size,
+                                          options.block_size_deviation));
+
     BlockBasedTableBuilder builder(
         options,
         sink_.get(),
-        new FlushBlockBySizePolicyFactory(
-            options.block_size, options.block_size_deviation),
+        flush_policy_factory.get(),
         options.compression);

     for (KVMap::const_iterator it = data.begin();

From 58e1956d50224175299dc50b6bea9b22cbda884f Mon Sep 17 00:00:00 2001
From: Siying Dong
Date: Wed, 20 Nov 2013 19:49:27 -0800
Subject: [PATCH 06/70] [Only for Performance Branch] A Hacky patch to lazily
 generate memtable key for prefix-hashed memtables.

Summary:
For prefix mem tables, encoding the mem table key may be unnecessary if the
prefix doesn't contain any key. This patch is a little bit hacky, but I want
to try out the performance gain of making this initialization lazy. In the
longer term, we might want to revisit the way we abstract memtable
implementations.
Test Plan: make all check Reviewers: haobo, igor, kailiu Reviewed By: igor CC: leveldb Differential Revision: https://reviews.facebook.net/D14265 --- db/memtable.cc | 9 ++++----- db/memtable.h | 2 ++ include/rocksdb/memtablerep.h | 2 +- util/hash_skiplist_rep.cc | 18 +++++++++++------- util/skiplistrep.cc | 11 +++++++++-- util/transformrep.cc | 20 ++++++++++++-------- util/vectorrep.cc | 11 ++++++++--- 7 files changed, 47 insertions(+), 26 deletions(-) diff --git a/db/memtable.cc b/db/memtable.cc index 291899c21..44eb160e7 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -75,7 +75,7 @@ Slice MemTableRep::UserKey(const char* key) const { // Encode a suitable internal key target for "target" and return it. // Uses *scratch as scratch space, and the returned pointer will point // into this scratch space. -static const char* EncodeKey(std::string* scratch, const Slice& target) { +const char* EncodeKey(std::string* scratch, const Slice& target) { scratch->clear(); PutVarint32(scratch, target.size()); scratch->append(target.data(), target.size()); @@ -96,7 +96,7 @@ class MemTableIterator: public Iterator { } virtual bool Valid() const { return iter_->Valid(); } - virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); } + virtual void Seek(const Slice& k) { iter_->Seek(k, nullptr); } virtual void SeekToFirst() { iter_->SeekToFirst(); } virtual void SeekToLast() { iter_->SeekToLast(); } virtual void Next() { iter_->Next(); } @@ -113,7 +113,6 @@ class MemTableIterator: public Iterator { private: std::shared_ptr iter_; - std::string tmp_; // For passing to EncodeKey // No copying allowed MemTableIterator(const MemTableIterator&); @@ -165,7 +164,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, Slice memkey = key.memtable_key(); std::shared_ptr iter( table_->GetIterator(key.user_key())); - iter->Seek(memkey.data()); + iter->Seek(key.user_key(), memkey.data()); // It is the caller's responsibility to allocate/delete operands list assert(operands != nullptr); @@ -274,7 +273,7 @@ bool MemTable::Update(SequenceNumber seq, ValueType type, std::shared_ptr iter( table_.get()->GetIterator(lkey.user_key())); - iter->Seek(memkey.data()); + iter->Seek(key, memkey.data()); if (iter->Valid()) { // entry format is: diff --git a/db/memtable.h b/db/memtable.h index 93b9b7e2c..9efb16431 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -169,4 +169,6 @@ class MemTable { port::RWMutex* GetLock(const Slice& key); }; +extern const char* EncodeKey(std::string* scratch, const Slice& target); + } // namespace rocksdb diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index 4147e5f3a..83b15d256 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -107,7 +107,7 @@ class MemTableRep { virtual void Prev() = 0; // Advance to the first entry with a key >= target - virtual void Seek(const char* target) = 0; + virtual void Seek(const Slice& user_key, const char* memtable_key) = 0; // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. 
diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc index b67911f93..290ce9d63 100644 --- a/util/hash_skiplist_rep.cc +++ b/util/hash_skiplist_rep.cc @@ -11,6 +11,7 @@ #include "port/port.h" #include "port/atomic_pointer.h" #include "util/murmurhash.h" +#include "db/memtable.h" #include "db/skiplist.h" namespace rocksdb { @@ -112,9 +113,12 @@ class HashSkipListRep : public MemTableRep { } // Advance to the first entry with a key >= target - virtual void Seek(const char* target) { + virtual void Seek(const Slice& user_key, const char* memtable_key) { if (list_ != nullptr) { - iter_.Seek(target); + const char* encoded_key = + (memtable_key != nullptr) ? + memtable_key : EncodeKey(&tmp_, user_key); + iter_.Seek(encoded_key); } } @@ -151,6 +155,7 @@ class HashSkipListRep : public MemTableRep { // here we track if we own list_. If we own it, we are also // responsible for it's cleaning. This is a poor man's shared_ptr bool own_list_; + std::string tmp_; // For passing to EncodeKey }; class DynamicIterator : public HashSkipListRep::Iterator { @@ -160,11 +165,10 @@ class HashSkipListRep : public MemTableRep { memtable_rep_(memtable_rep) {} // Advance to the first entry with a key >= target - virtual void Seek(const char* target) { - auto transformed = memtable_rep_.transform_->Transform( - memtable_rep_.UserKey(target)); + virtual void Seek(const Slice& k, const char* memtable_key) { + auto transformed = memtable_rep_.transform_->Transform(k); Reset(memtable_rep_.GetBucket(transformed)); - HashSkipListRep::Iterator::Seek(target); + HashSkipListRep::Iterator::Seek(k, memtable_key); } // Position at the first entry in collection. @@ -201,7 +205,7 @@ class HashSkipListRep : public MemTableRep { } virtual void Next() { } virtual void Prev() { } - virtual void Seek(const char* target) { } + virtual void Seek(const Slice& user_key, const char* memtable_key) { } virtual void SeekToFirst() { } virtual void SeekToLast() { } private: diff --git a/util/skiplistrep.cc b/util/skiplistrep.cc index 955d754b1..f4c6e0c93 100644 --- a/util/skiplistrep.cc +++ b/util/skiplistrep.cc @@ -70,8 +70,13 @@ public: } // Advance to the first entry with a key >= target - virtual void Seek(const char* target) override { - iter_.Seek(target); + virtual void Seek(const Slice& user_key, const char* memtable_key) + override { + if (memtable_key != nullptr) { + iter_.Seek(memtable_key); + } else { + iter_.Seek(EncodeKey(&tmp_, user_key)); + } } // Position at the first entry in list. @@ -85,6 +90,8 @@ public: virtual void SeekToLast() override { iter_.SeekToLast(); } + protected: + std::string tmp_; // For passing to EncodeKey }; // Unhide default implementations of GetIterator diff --git a/util/transformrep.cc b/util/transformrep.cc index 4c7df1321..ef1205570 100644 --- a/util/transformrep.cc +++ b/util/transformrep.cc @@ -13,6 +13,7 @@ #include "rocksdb/arena.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" +#include "db/memtable.h" #include "port/port.h" #include "util/mutexlock.h" #include "util/murmurhash.h" @@ -110,7 +111,7 @@ class TransformRep : public MemTableRep { virtual void Prev(); // Advance to the first entry with a key >= target - virtual void Seek(const char* target); + virtual void Seek(const Slice& user_key, const char* memtable_key); // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. 
@@ -122,6 +123,7 @@ class TransformRep : public MemTableRep { private: std::shared_ptr items_; Bucket::const_iterator cit_; + std::string tmp_; // For passing to EncodeKey }; class EmptyIterator : public MemTableRep::Iterator { @@ -137,7 +139,7 @@ class TransformRep : public MemTableRep { } virtual void Next() { } virtual void Prev() { } - virtual void Seek(const char* target) { } + virtual void Seek(const Slice& user_key, const char* memtable_key) { } virtual void SeekToFirst() { } virtual void SeekToLast() { } static std::shared_ptr GetInstance(); @@ -197,9 +199,8 @@ class TransformRep : public MemTableRep { // Advance to the first entry with a key >= target within the // same bucket as target - virtual void Seek(const char* target) { - Slice prefix = memtable_rep_.transform_->Transform( - memtable_rep_.UserKey(target)); + virtual void Seek(const Slice& user_key, const char* memtable_key) { + Slice prefix = memtable_rep_.transform_->Transform(user_key); ReadLock l(&memtable_rep_.rwlock_); auto bucket = memtable_rep_.buckets_.find(prefix); @@ -208,7 +209,7 @@ class TransformRep : public MemTableRep { } else { bucket_iterator_.reset( new TransformIterator(bucket->second, memtable_rep_.GetLock(prefix))); - bucket_iterator_->Seek(target); + bucket_iterator_->Seek(user_key, memtable_key); } } @@ -343,8 +344,11 @@ void TransformRep::Iterator::Prev() { } // Advance to the first entry with a key >= target -void TransformRep::Iterator::Seek(const char* target) { - cit_ = items_->lower_bound(target); +void TransformRep::Iterator::Seek(const Slice& user_key, + const char* memtable_key) { + const char* encoded_key = + (memtable_key != nullptr) ? memtable_key : EncodeKey(&tmp_, user_key); + cit_ = items_->lower_bound(encoded_key); } // Position at the first entry in collection. diff --git a/util/vectorrep.cc b/util/vectorrep.cc index 8d3ccc9df..3887f356d 100644 --- a/util/vectorrep.cc +++ b/util/vectorrep.cc @@ -12,6 +12,7 @@ #include #include "rocksdb/arena.h" +#include "db/memtable.h" #include "port/port.h" #include "util/mutexlock.h" #include "util/stl_wrappers.h" @@ -45,6 +46,7 @@ class VectorRep : public MemTableRep { std::shared_ptr> bucket_; typename std::vector::const_iterator mutable cit_; const KeyComparator& compare_; + std::string tmp_; // For passing to EncodeKey bool mutable sorted_; void DoSort() const; public: @@ -73,7 +75,7 @@ class VectorRep : public MemTableRep { virtual void Prev() override; // Advance to the first entry with a key >= target - virtual void Seek(const char* target) override; + virtual void Seek(const Slice& user_key, const char* memtable_key) override; // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. @@ -200,12 +202,15 @@ void VectorRep::Iterator::Prev() { } // Advance to the first entry with a key >= target -void VectorRep::Iterator::Seek(const char* target) { +void VectorRep::Iterator::Seek(const Slice& user_key, + const char* memtable_key) { DoSort(); // Do binary search to find first value not less than the target + const char* encoded_key = + (memtable_key != nullptr) ? 
memtable_key : EncodeKey(&tmp_, user_key); cit_ = std::equal_range(bucket_->begin(), bucket_->end(), - target, + encoded_key, [this] (const char* a, const char* b) { return compare_(a, b) < 0; }).first; From b135d01e7bcdf4186ea852a5b4e6d14a3a815d77 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Mon, 18 Nov 2013 11:32:54 -0800 Subject: [PATCH 07/70] Allow users to profile a query and see bottleneck of the query Summary: Provide a framework to profile a query in detail to figure out latency bottleneck. Currently, in Get(), Put() and iterators, 2-3 simple timing is used. We can easily add more profile counters to the framework later. Test Plan: Enable this profiling in seveal existing tests. Reviewers: haobo, dhruba, kailiu, emayanke, vamsi, igor CC: leveldb Differential Revision: https://reviews.facebook.net/D14001 Conflicts: table/merger.cc --- db/db_bench.cc | 8 ++++++- db/db_impl.cc | 34 +++++++++++++++++++++++++--- db/db_iter.cc | 22 ++++++++++++++++-- db/db_test.cc | 41 ++++++++++++++++++++++++++++++++++ db/memtable.cc | 20 ++++++++++++----- db/perf_context_test.cc | 34 +++++++++++++++++++++++++--- db/version_set.cc | 2 +- include/rocksdb/perf_context.h | 22 +++++++++++++++++- table/merger.cc | 28 +++++++++++++++++++---- table/merger.h | 3 ++- util/perf_context.cc | 15 ++++++++++++- 11 files changed, 207 insertions(+), 22 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index 63cc906e7..3ab130093 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -22,6 +22,7 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/write_batch.h" #include "rocksdb/statistics.h" +#include "rocksdb/perf_context.h" #include "port/port.h" #include "util/bit_set.h" #include "util/crc32c.h" @@ -350,6 +351,8 @@ DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when " DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when" " this is greater than 0."); +DEFINE_int32(perf_level, 0, "Level of perf collection"); + static bool ValidateRateLimit(const char* flagname, double value) { static constexpr double EPSILON = 1e-10; if ( value < -EPSILON ) { @@ -689,6 +692,7 @@ struct SharedState { port::Mutex mu; port::CondVar cv; int total; + int perf_level; // Each thread goes through the following states: // (1) initializing @@ -700,7 +704,7 @@ struct SharedState { long num_done; bool start; - SharedState() : cv(&mu) { } + SharedState() : cv(&mu), perf_level(FLAGS_perf_level) { } }; // Per-thread state for concurrent executions of the same benchmark. 
@@ -810,6 +814,7 @@ class Benchmark { fprintf(stdout, "Memtablerep: vector\n"); break; } + fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level); PrintWarnings(); fprintf(stdout, "------------------------------------------------\n"); @@ -1150,6 +1155,7 @@ class Benchmark { } } + SetPerfLevel(static_cast (shared->perf_level)); thread->stats.Start(thread->tid); (arg->bm->*(arg->method))(thread); thread->stats.Stop(); diff --git a/db/db_impl.cc b/db/db_impl.cc index 5a2f0de4a..a4e28b032 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1035,7 +1035,7 @@ Status DBImpl::WriteLevel0Table(std::vector &mems, VersionEdit* edit, (unsigned long)m->GetLogNumber()); list.push_back(m->NewIterator()); } - Iterator* iter = NewMergingIterator(&internal_comparator_, &list[0], + Iterator* iter = NewMergingIterator(env_, &internal_comparator_, &list[0], list.size()); const SequenceNumber newest_snapshot = snapshots_.GetNewest(); const SequenceNumber earliest_seqno_in_memtable = @@ -2519,7 +2519,7 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, // Collect iterators for files in L0 - Ln versions_->current()->AddIterators(options, storage_options_, &list); Iterator* internal_iter = - NewMergingIterator(&internal_comparator_, &list[0], list.size()); + NewMergingIterator(env_, &internal_comparator_, &list[0], list.size()); versions_->current()->Ref(); cleanup->mu = &mutex_; @@ -2555,6 +2555,8 @@ Status DBImpl::GetImpl(const ReadOptions& options, Status s; StopWatch sw(env_, options_.statistics, DB_GET); + StopWatchNano snapshot_timer(env_, false); + StartPerfTimer(&snapshot_timer); SequenceNumber snapshot; mutex_.Lock(); if (options.snapshot != nullptr) { @@ -2583,15 +2585,23 @@ Status DBImpl::GetImpl(const ReadOptions& options, // s is both in/out. When in, s could either be OK or MergeInProgress. // merge_operands will contain the sequence of merges in the latter case. LookupKey lkey(key, snapshot); + BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer); if (mem->Get(lkey, value, &s, &merge_operands, options_)) { // Done } else if (imm.Get(lkey, value, &s, &merge_operands, options_)) { // Done } else { + StopWatchNano from_files_timer(env_, false); + StartPerfTimer(&from_files_timer); + current->Get(options, lkey, value, &s, &merge_operands, &stats, options_, value_found); have_stat_update = true; + BumpPerfTime(&perf_context.get_from_output_files_time, &from_files_timer); } + + StopWatchNano post_process_timer(env_, false); + StartPerfTimer(&post_process_timer); mutex_.Lock(); if (!options_.disable_seek_compaction && @@ -2607,6 +2617,8 @@ Status DBImpl::GetImpl(const ReadOptions& options, // Note, tickers are atomic now - no lock protection needed any more. 
RecordTick(options_.statistics, NUMBER_KEYS_READ); RecordTick(options_.statistics, BYTES_READ, value->size()); + BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer); + return s; } @@ -2615,6 +2627,8 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, std::vector* values) { StopWatch sw(env_, options_.statistics, DB_MULTIGET); + StopWatchNano snapshot_timer(env_, false); + StartPerfTimer(&snapshot_timer); SequenceNumber snapshot; mutex_.Lock(); if (options.snapshot != nullptr) { @@ -2646,6 +2660,7 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, // Keep track of bytes that we read for statistics-recording later uint64_t bytesRead = 0; + BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer); // For each of the given keys, apply the entire "get" process as follows: // First look in the memtable, then in the immutable memtable (if any). @@ -2672,6 +2687,8 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, } // Post processing (decrement reference counts and record statistics) + StopWatchNano post_process_timer(env_, false); + StartPerfTimer(&post_process_timer); mutex_.Lock(); if (!options_.disable_seek_compaction && have_stat_update && current->UpdateStats(stats)) { @@ -2686,6 +2703,7 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, RecordTick(options_.statistics, NUMBER_MULTIGET_CALLS); RecordTick(options_.statistics, NUMBER_MULTIGET_KEYS_READ, numKeys); RecordTick(options_.statistics, NUMBER_MULTIGET_BYTES_READ, bytesRead); + BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer); return statList; } @@ -2754,6 +2772,8 @@ Status DBImpl::Delete(const WriteOptions& options, const Slice& key) { } Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { + StopWatchNano pre_post_process_timer(env_, false); + StartPerfTimer(&pre_post_process_timer); Writer w(&mutex_); w.batch = my_batch; w.sync = options.sync; @@ -2800,12 +2820,13 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { if (options.disableWAL) { flush_on_destroy_ = true; } + BumpPerfTime(&perf_context.write_pre_and_post_process_time, + &pre_post_process_timer); if (!options.disableWAL) { StopWatchNano timer(env_); StartPerfTimer(&timer); status = log_->AddRecord(WriteBatchInternal::Contents(updates)); - BumpPerfTime(&perf_context.wal_write_time, &timer); if (status.ok() && options.sync) { if (options_.use_fsync) { StopWatch(env_, options_.statistics, WAL_FILE_SYNC_MICROS); @@ -2815,10 +2836,14 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { status = log_->file()->Sync(); } } + BumpPerfTime(&perf_context.write_wal_time, &timer); } if (status.ok()) { + StopWatchNano write_memtable_timer(env_, false); + StartPerfTimer(&write_memtable_timer); status = WriteBatchInternal::InsertInto(updates, mem_, &options_, this, options_.filter_deletes); + BumpPerfTime(&perf_context.write_memtable_time, &write_memtable_timer); if (!status.ok()) { // Panic for in-memory corruptions // Note that existing logic was not sound. 
Any partial failure writing @@ -2828,6 +2853,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { } SetTickerCount(options_.statistics, SEQUENCE_NUMBER, last_sequence); } + StartPerfTimer(&pre_post_process_timer); LogFlush(options_.info_log); mutex_.Lock(); if (status.ok()) { @@ -2855,6 +2881,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { if (!writers_.empty()) { writers_.front()->cv.Signal(); } + BumpPerfTime(&perf_context.write_pre_and_post_process_time, + &pre_post_process_timer); return status; } diff --git a/db/db_iter.cc b/db/db_iter.cc index 7a395db7a..9187313f2 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -102,7 +102,8 @@ class DBIter: public Iterator { virtual void SeekToLast(); private: - void FindNextUserEntry(bool skipping); + inline void FindNextUserEntry(bool skipping); + void FindNextUserEntryInternal(bool skipping); void FindPrevUserEntry(); bool ParseKey(ParsedInternalKey* key); void MergeValuesNewToOld(); @@ -191,7 +192,15 @@ void DBIter::Next() { // // NOTE: In between, saved_key_ can point to a user key that has // a delete marker -void DBIter::FindNextUserEntry(bool skipping) { +inline void DBIter::FindNextUserEntry(bool skipping) { + StopWatchNano timer(env_, false); + StartPerfTimer(&timer); + FindNextUserEntryInternal(skipping); + BumpPerfTime(&perf_context.find_next_user_entry_time, &timer); +} + +// Actual implementation of DBIter::FindNextUserEntry() +void DBIter::FindNextUserEntryInternal(bool skipping) { // Loop until we hit an acceptable entry to yield assert(iter_->Valid()); assert(direction_ == kForward); @@ -429,7 +438,10 @@ void DBIter::Seek(const Slice& target) { saved_key_.clear(); AppendInternalKey( &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek)); + StopWatchNano internal_seek_timer(env_, false); + StartPerfTimer(&internal_seek_timer); iter_->Seek(saved_key_); + BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); if (iter_->Valid()) { direction_ = kForward; ClearSavedValue(); @@ -442,7 +454,10 @@ void DBIter::Seek(const Slice& target) { void DBIter::SeekToFirst() { direction_ = kForward; ClearSavedValue(); + StopWatchNano internal_seek_timer(env_, false); + StartPerfTimer(&internal_seek_timer); iter_->SeekToFirst(); + BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); if (iter_->Valid()) { FindNextUserEntry(false /* not skipping */); } else { @@ -461,7 +476,10 @@ void DBIter::SeekToLast() { direction_ = kReverse; ClearSavedValue(); + StopWatchNano internal_seek_timer(env_, false); + StartPerfTimer(&internal_seek_timer); iter_->SeekToLast(); + BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); FindPrevUserEntry(); } diff --git a/db/db_test.cc b/db/db_test.cc index aca07bcff..ed7425521 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -22,6 +22,7 @@ #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" #include "rocksdb/table.h" +#include "rocksdb/perf_context.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" @@ -1215,7 +1216,13 @@ TEST(DBTest, IterMulti) { ASSERT_EQ(IterStatus(iter), "a->va"); iter->Seek("ax"); ASSERT_EQ(IterStatus(iter), "b->vb"); + + SetPerfLevel(kEnableTime); + perf_context.Reset(); iter->Seek("b"); + ASSERT_TRUE((int) perf_context.seek_internal_seek_time > 0); + ASSERT_TRUE((int) perf_context.find_next_user_entry_time > 0); + SetPerfLevel(kDisable); ASSERT_EQ(IterStatus(iter), "b->vb"); iter->Seek("z"); ASSERT_EQ(IterStatus(iter), 
"(invalid)"); @@ -1230,7 +1237,12 @@ TEST(DBTest, IterMulti) { // Switch from forward to reverse iter->SeekToFirst(); iter->Next(); + SetPerfLevel(kEnableTime); + perf_context.Reset(); iter->Next(); + ASSERT_EQ(0, (int) perf_context.seek_internal_seek_time); + ASSERT_TRUE((int) perf_context.find_next_user_entry_time > 0); + SetPerfLevel(kDisable); iter->Prev(); ASSERT_EQ(IterStatus(iter), "b->vb"); @@ -1590,22 +1602,42 @@ TEST(DBTest, NumImmutableMemTable) { std::string big_value(1000000, 'x'); std::string num; + SetPerfLevel(kEnableTime);; ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value)); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "0"); + perf_context.Reset(); + Get("k1"); + ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value)); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "1"); + perf_context.Reset(); + Get("k1"); + ASSERT_EQ(2, (int) perf_context.get_from_memtable_count); + perf_context.Reset(); + Get("k2"); + ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value)); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "2"); + perf_context.Reset(); + Get("k2"); + ASSERT_EQ(2, (int) perf_context.get_from_memtable_count); + perf_context.Reset(); + Get("k3"); + ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); + perf_context.Reset(); + Get("k1"); + ASSERT_EQ(3, (int) perf_context.get_from_memtable_count); dbfull()->Flush(FlushOptions()); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "0"); + SetPerfLevel(kDisable); } while (ChangeCompactOptions()); } @@ -1614,11 +1646,16 @@ TEST(DBTest, FLUSH) { Options options = CurrentOptions(); WriteOptions writeOpt = WriteOptions(); writeOpt.disableWAL = true; + SetPerfLevel(kEnableTime);; ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1")); // this will now also flush the last 2 writes dbfull()->Flush(FlushOptions()); ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v1")); + perf_context.Reset(); + Get("foo"); + ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0); + Reopen(); ASSERT_EQ("v1", Get("foo")); ASSERT_EQ("v1", Get("bar")); @@ -1630,7 +1667,9 @@ TEST(DBTest, FLUSH) { Reopen(); ASSERT_EQ("v2", Get("bar")); + perf_context.Reset(); ASSERT_EQ("v2", Get("foo")); + ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0); writeOpt.disableWAL = false; ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v3")); @@ -1642,6 +1681,8 @@ TEST(DBTest, FLUSH) { // has WAL enabled. 
ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v3", Get("bar")); + + SetPerfLevel(kDisable); } while (ChangeCompactOptions()); } diff --git a/db/memtable.cc b/db/memtable.cc index 44eb160e7..dce0c382f 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -19,6 +19,8 @@ #include "util/coding.h" #include "util/mutexlock.h" #include "util/murmurhash.h" +#include "util/perf_context_imp.h" +#include "util/stop_watch.h" namespace std { template <> @@ -161,6 +163,9 @@ void MemTable::Add(SequenceNumber s, ValueType type, bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, std::deque* operands, const Options& options) { + StopWatchNano memtable_get_timer(options.env, false); + StartPerfTimer(&memtable_get_timer); + Slice memkey = key.memtable_key(); std::shared_ptr iter( table_->GetIterator(key.user_key())); @@ -174,7 +179,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, auto logger = options.info_log; std::string merge_result; - for (; iter->Valid(); iter->Next()) { + bool found_final_value = false; + for (; !found_final_value && iter->Valid(); iter->Next()) { // entry format is: // klength varint32 // userkey char[klength-8] @@ -211,7 +217,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, if (options.inplace_update_support) { GetLock(key.user_key())->Unlock(); } - return true; + found_final_value = true; + break; } case kTypeDeletion: { if (merge_in_progress) { @@ -225,7 +232,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, } else { *s = Status::NotFound(Slice()); } - return true; + found_final_value = true; + break; } case kTypeMerge: { Slice v = GetLengthPrefixedSlice(key_ptr + key_length); @@ -259,10 +267,12 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, // No change to value, since we have not yet found a Put/Delete - if (merge_in_progress) { + if (!found_final_value && merge_in_progress) { *s = Status::MergeInProgress(""); } - return false; + BumpPerfTime(&perf_context.get_from_memtable_time, &memtable_get_timer); + BumpPerfCount(&perf_context.get_from_memtable_count); + return found_final_value; } bool MemTable::Update(SequenceNumber seq, ValueType type, diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index 05416748d..2a6e6b7e4 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -174,6 +174,13 @@ void ProfileKeyComparison() { HistogramImpl hist_put; HistogramImpl hist_get; + HistogramImpl hist_get_snapshot; + HistogramImpl hist_get_memtable; + HistogramImpl hist_get_post_process; + HistogramImpl hist_num_memtable_checked; + HistogramImpl hist_write_pre_post; + HistogramImpl hist_write_wal_time; + HistogramImpl hist_write_memtable_time; std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n"; @@ -192,16 +199,37 @@ void ProfileKeyComparison() { perf_context.Reset(); db->Put(write_options, key, value); + hist_write_pre_post.Add(perf_context.write_pre_and_post_process_time); + hist_write_wal_time.Add(perf_context.write_wal_time); + hist_write_memtable_time.Add(perf_context.write_memtable_time); hist_put.Add(perf_context.user_key_comparison_count); perf_context.Reset(); db->Get(read_options, key, &value); + hist_get_snapshot.Add(perf_context.get_snapshot_time); + hist_get_memtable.Add(perf_context.get_from_memtable_time); + hist_num_memtable_checked.Add(perf_context.get_from_memtable_count); + hist_get_post_process.Add(perf_context.get_post_process_time); hist_get.Add(perf_context.user_key_comparison_count); } std::cout << 
"Put uesr key comparison: \n" << hist_put.ToString() << "Get uesr key comparison: \n" << hist_get.ToString(); - + std::cout << "Put(): Pre and Post Process Time: \n" + << hist_write_pre_post.ToString() + << " Writing WAL time: \n" + << hist_write_wal_time.ToString() << "\n" + << " Writing Mem Table time: \n" + << hist_write_memtable_time.ToString() << "\n"; + + std::cout << "Get(): Time to get snapshot: \n" + << hist_get_snapshot.ToString() + << " Time to get value from memtables: \n" + << hist_get_memtable.ToString() << "\n" + << " Number of memtables checked: \n" + << hist_num_memtable_checked.ToString() << "\n" + << " Time to post process: \n" + << hist_get_post_process.ToString() << "\n"; } TEST(PerfContextTest, KeyComparisonCount) { @@ -259,8 +287,8 @@ TEST(PerfContextTest, SeekKeyComparison) { db->Put(write_options, key, value); auto put_time = timer.ElapsedNanos(); hist_put_time.Add(put_time); - hist_wal_time.Add(perf_context.wal_write_time); - hist_time_diff.Add(put_time - perf_context.wal_write_time); + hist_wal_time.Add(perf_context.write_wal_time); + hist_time_diff.Add(put_time - perf_context.write_wal_time); } std::cout << "Put time:\n" << hist_put_time.ToString() diff --git a/db/version_set.cc b/db/version_set.cc index d554657b4..349abfbaa 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2091,7 +2091,7 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { } } assert(num <= space); - Iterator* result = NewMergingIterator(&icmp_, list, num); + Iterator* result = NewMergingIterator(env_, &icmp_, list, num); delete[] list; return result; } diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index 9e900e050..551ca8fe6 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -38,7 +38,27 @@ struct PerfContext { uint64_t internal_key_skipped_count; // total number of deletes skipped over during iteration uint64_t internal_delete_skipped_count; - uint64_t wal_write_time; // total time spent on writing to WAL + + uint64_t get_snapshot_time; // total time spent on getting snapshot + uint64_t get_from_memtable_time; // total time spent on querying memtables + uint64_t get_from_memtable_count; // number of mem tables queried + // total time spent after Get() finds a key + uint64_t get_post_process_time; + uint64_t get_from_output_files_time; // total time reading from output files + // total time spent on seeking child iters + uint64_t seek_child_seek_time; + // number of seek issued in child iterators + uint64_t seek_child_seek_count; + uint64_t seek_min_heap_time; // total time spent on the merge heap + // total time spent on seeking the internal entries + uint64_t seek_internal_seek_time; + // total time spent on iterating internal entries to find the next user entry + uint64_t find_next_user_entry_time; + // total time spent on pre or post processing when writing a record + uint64_t write_pre_and_post_process_time; + uint64_t write_wal_time; // total time spent on writing to WAL + // total time spent on writing to mem tables + uint64_t write_memtable_time; }; extern __thread PerfContext perf_context; diff --git a/table/merger.cc b/table/merger.cc index 89faf402b..1aed00cc5 100644 --- a/table/merger.cc +++ b/table/merger.cc @@ -11,8 +11,11 @@ #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" +#include "rocksdb/options.h" #include "table/iter_heap.h" #include "table/iterator_wrapper.h" +#include "util/stop_watch.h" +#include "util/perf_context_imp.h" #include @@ -22,11 +25,13 @@ namespace { class 
MergingIterator : public Iterator { public: - MergingIterator(const Comparator* comparator, Iterator** children, int n) + MergingIterator(Env* const env, const Comparator* comparator, + Iterator** children, int n) : comparator_(comparator), children_(n), current_(nullptr), use_heap_(true), + env_(env), direction_(kForward), maxHeap_(NewMaxIterHeap(comparator_)), minHeap_ (NewMinIterHeap(comparator_)) { @@ -74,8 +79,14 @@ class MergingIterator : public Iterator { // Invalidate the heap. use_heap_ = false; IteratorWrapper* first_child = nullptr; + StopWatchNano child_seek_timer(env_, false); + StopWatchNano min_heap_timer(env_, false); for (auto& child : children_) { + StartPerfTimer(&child_seek_timer); child.Seek(target); + BumpPerfTime(&perf_context.seek_child_seek_time, &child_seek_timer); + BumpPerfCount(&perf_context.seek_child_seek_count); + if (child.Valid()) { // This child has valid key if (!use_heap_) { @@ -86,24 +97,31 @@ class MergingIterator : public Iterator { } else { // We have more than one children with valid keys. Initialize // the heap and put the first child into the heap. + StartPerfTimer(&min_heap_timer); ClearHeaps(); + BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer); + StartPerfTimer(&min_heap_timer); minHeap_.push(first_child); + BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer); } } if (use_heap_) { + StartPerfTimer(&min_heap_timer); minHeap_.push(&child); + BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer); } } } if (use_heap_) { // If heap is valid, need to put the smallest key to curent_. + StartPerfTimer(&min_heap_timer); FindSmallest(); + BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer); } else { // The heap is not valid, then the current_ iterator is the first // one, or null if there is no first child. current_ = first_child; } - direction_ = kForward; } virtual void Next() { @@ -211,6 +229,7 @@ class MergingIterator : public Iterator { // contain valid rows. If it is false, only current_ can possibly contain // valid rows. bool use_heap_; + Env* const env_; // Which direction is the iterator moving? enum Direction { kForward, @@ -250,14 +269,15 @@ void MergingIterator::ClearHeaps() { } } // namespace -Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) { +Iterator* NewMergingIterator(Env* const env, const Comparator* cmp, + Iterator** list, int n) { assert(n >= 0); if (n == 0) { return NewEmptyIterator(); } else if (n == 1) { return list[0]; } else { - return new MergingIterator(cmp, list, n); + return new MergingIterator(env, cmp, list, n); } } diff --git a/table/merger.h b/table/merger.h index dbc1f69eb..74f46ac9b 100644 --- a/table/merger.h +++ b/table/merger.h @@ -13,6 +13,7 @@ namespace rocksdb { class Comparator; class Iterator; +class Env; // Return an iterator that provided the union of the data in // children[0,n-1]. 
Takes ownership of the child iterators and @@ -23,6 +24,6 @@ class Iterator; // // REQUIRES: n >= 0 extern Iterator* NewMergingIterator( - const Comparator* comparator, Iterator** children, int n); + Env* const env, const Comparator* comparator, Iterator** children, int n); } // namespace rocksdb diff --git a/util/perf_context.cc b/util/perf_context.cc index 1e8ddfb5e..6833f6836 100644 --- a/util/perf_context.cc +++ b/util/perf_context.cc @@ -22,7 +22,20 @@ void PerfContext::Reset() { block_decompress_time = 0; internal_key_skipped_count = 0; internal_delete_skipped_count = 0; - wal_write_time = 0; + write_wal_time = 0; + + get_snapshot_time = 0; + get_from_memtable_time = 0; + get_from_memtable_count = 0; + get_post_process_time = 0; + get_from_output_files_time = 0; + seek_child_seek_time = 0; + seek_child_seek_count = 0; + seek_min_heap_time = 0; + seek_internal_seek_time = 0; + find_next_user_entry_time = 0; + write_pre_and_post_process_time = 0; + write_memtable_time = 0; } __thread PerfContext perf_context; From 718488abc5ff8c29f546922cc68d0c1770b640b8 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 21 Nov 2013 15:13:45 -0800 Subject: [PATCH 08/70] Add BloomFilter to PlainTableIterator::Seek() Summary: This patch adds a simple bloom filter in PlainTableIterator::Seek() Test Plan: N/A Reviewers: CC: Task ID: # Blame Rev: --- db/plain_table_db_test.cc | 2 +- include/rocksdb/plain_table_factory.h | 13 ++++++- table/plain_table_factory.cc | 6 +-- table/plain_table_reader.cc | 53 +++++++++++++++++++++++---- table/plain_table_reader.h | 11 +++++- table/table_reader_bench.cc | 2 +- 6 files changed, 71 insertions(+), 16 deletions(-) diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 3697b4c45..0baf56ecd 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -59,7 +59,7 @@ public: // Return the current option configuration. Options CurrentOptions() { Options options; - options.table_factory.reset(new PlainTableFactory(16, 8)); + options.table_factory.reset(new PlainTableFactory(16, 8, 2, 0.8)); options.allow_mmap_reads = true; return options; } diff --git a/include/rocksdb/plain_table_factory.h b/include/rocksdb/plain_table_factory.h index f8a0cb9a9..0d843e2c2 100644 --- a/include/rocksdb/plain_table_factory.h +++ b/include/rocksdb/plain_table_factory.h @@ -47,8 +47,15 @@ class PlainTableFactory: public TableFactory { public: ~PlainTableFactory() { } - PlainTableFactory(int user_key_size, int key_prefix_len) : - user_key_size_(user_key_size), key_prefix_len_(key_prefix_len) { + // user_key_size is the length of the user key. key_prefix_len is the + // length of the prefix used for im-memory indexes. bloom_num_bits is + // number of bits is used for bloom filer per key. hash_table_ratio is + // the desired ultilization of the hash table used for prefix hashing. 
+ // hash_table_ratio = number of prefixes / #buckets in the hash table + PlainTableFactory(int user_key_size, int key_prefix_len, + int bloom_num_bits = 0, double hash_table_ratio = 0.75) : + user_key_size_(user_key_size), key_prefix_len_(key_prefix_len), + bloom_num_bits_(bloom_num_bits), hash_table_ratio_(hash_table_ratio) { } const char* Name() const override { return "PlainTable"; @@ -64,6 +71,8 @@ public: private: int user_key_size_; int key_prefix_len_; + int bloom_num_bits_; + double hash_table_ratio_; }; } // namespace rocksdb diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc index 10393501d..08e75c4ec 100644 --- a/table/plain_table_factory.cc +++ b/table/plain_table_factory.cc @@ -19,13 +19,13 @@ Status PlainTableFactory::GetTableReader(const Options& options, unique_ptr* table) const { return PlainTableReader::Open(options, soptions, std::move(file), file_size, - table, user_key_size_, key_prefix_len_); + table, user_key_size_, key_prefix_len_, + bloom_num_bits_, hash_table_ratio_); } TableBuilder* PlainTableFactory::GetTableBuilder( const Options& options, WritableFile* file, CompressionType compression_type) const { - return new PlainTableBuilder(options, file, user_key_size_, - key_prefix_len_); + return new PlainTableBuilder(options, file, user_key_size_, key_prefix_len_); } } // namespace rocksdb diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index 5577c4eca..657dc031e 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -40,9 +40,16 @@ namespace rocksdb { PlainTableReader::PlainTableReader(const EnvOptions& storage_options, uint64_t file_size, int user_key_size, - int key_prefix_len) : - soptions_(storage_options), file_size_(file_size), - user_key_size_(user_key_size), key_prefix_len_(key_prefix_len) { + int key_prefix_len, int bloom_bits_per_key, + double hash_table_ratio) : + hash_table_size_(0), soptions_(storage_options), file_size_(file_size), + user_key_size_(user_key_size), key_prefix_len_(key_prefix_len), + hash_table_ratio_(hash_table_ratio) { + if (bloom_bits_per_key > 0) { + filter_policy_ = NewBloomFilterPolicy(bloom_bits_per_key); + } else { + filter_policy_ = nullptr; + } hash_table_ = nullptr; } @@ -50,6 +57,9 @@ PlainTableReader::~PlainTableReader() { if (hash_table_ != nullptr) { delete[] hash_table_; } + if (filter_policy_ != nullptr) { + delete filter_policy_; + } } Status PlainTableReader::Open(const Options& options, @@ -58,12 +68,16 @@ Status PlainTableReader::Open(const Options& options, uint64_t file_size, unique_ptr* table_reader, const int user_key_size, - const int key_prefix_len) { + const int key_prefix_len, + const int bloom_num_bits, + double hash_table_ratio) { assert(options.allow_mmap_reads); PlainTableReader* t = new PlainTableReader(soptions, file_size, user_key_size, - key_prefix_len); + key_prefix_len, + bloom_num_bits, + hash_table_ratio); t->file_ = std::move(file); t->options_ = options; Status s = t->PopulateIndex(file_size); @@ -146,14 +160,25 @@ Status PlainTableReader::PopulateIndex(uint64_t file_size) { delete[] hash_table_; } // Make the hash table 3/5 full - hash_table_size_ = tmp_index.size() * 1.66; + std::vector filter_entries(0); // for creating bloom filter; + if (filter_policy_ != nullptr) { + filter_entries.resize(tmp_index.size()); + } + double hash_table_size_multipier = + (hash_table_ratio_ < 1.0) ? 
1.0 : 1.0 / hash_table_ratio_; + hash_table_size_ = tmp_index.size() * hash_table_size_multipier + 1; hash_table_ = new char[GetHashTableRecordLen() * hash_table_size_]; for (int i = 0; i < hash_table_size_; i++) { memcpy(GetHashTableBucketPtr(i) + key_prefix_len_, &file_size_, kOffsetLen); } + size_t count = 0; for (auto it = tmp_index.begin(); it != tmp_index.end(); ++it) { + if (filter_policy_ != nullptr) { + filter_entries[count++] = it->first; + } + int bucket = GetHashTableBucket(it->first); uint64_t* hash_value; while (true) { @@ -168,6 +193,10 @@ Status PlainTableReader::PopulateIndex(uint64_t file_size) { memcpy(bucket_ptr, it->first.data(), key_prefix_len_); memcpy(bucket_ptr + key_prefix_len_, &it->second, kOffsetLen); } + if (filter_policy_ != nullptr) { + filter_policy_->CreateFilter(&filter_entries[0], count, &filter_str_); + filter_slice_ = Slice(filter_str_.data(), filter_str_.size()); + } Log(options_.info_log, "Number of prefixes: %d, suffix_map length %ld", hash_table_size_, sub_index_.length()); @@ -187,7 +216,6 @@ inline void PlainTableReader::GetHashValue(int bucket, uint64_t** ret_value) { Status PlainTableReader::GetOffset(const Slice& target, uint64_t* offset) { Status s; - int bucket = GetHashTableBucket(target); uint64_t* found_value; Slice hash_key; @@ -248,6 +276,12 @@ Status PlainTableReader::GetOffset(const Slice& target, uint64_t* offset) { return s; } +bool PlainTableReader::MayHavePrefix(const Slice& target_prefix) { + return filter_policy_ == nullptr + || filter_policy_->KeyMayMatch(target_prefix, filter_slice_); +} + + uint64_t PlainTableReader::Next(uint64_t offset, Slice* key, Slice* value, Slice* tmp_slice) { if (offset >= file_size_) { @@ -321,6 +355,11 @@ void PlainTableIterator::SeekToLast() { } void PlainTableIterator::Seek(const Slice& target) { + if (!table_->MayHavePrefix(Slice(target.data(), table_->key_prefix_len_))) { + offset_ = next_offset_ = table_->file_size_; + return; + } + Status s = table_->GetOffset(target, &next_offset_); if (!s.ok()) { status_ = s; diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index 44b545833..d9ac34326 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -57,7 +57,8 @@ public: static Status Open(const Options& options, const EnvOptions& soptions, unique_ptr && file, uint64_t file_size, unique_ptr* table, const int user_key_size, - const int key_prefix_len); + const int key_prefix_len, const int bloom_num_bits, + double hash_table_ratio); bool PrefixMayMatch(const Slice& internal_prefix); @@ -79,7 +80,8 @@ public: } PlainTableReader(const EnvOptions& storage_options, uint64_t file_size, - int user_key_size, int key_prefix_len); + int user_key_size, int key_prefix_len, int bloom_num_bits, + double hash_table_ratio); ~PlainTableReader(); private: @@ -95,6 +97,10 @@ private: uint64_t file_size_; const size_t user_key_size_; const size_t key_prefix_len_; + const double hash_table_ratio_; + const FilterPolicy* filter_policy_; + std::string filter_str_; + Slice filter_slice_; TableProperties tbl_props; @@ -123,6 +129,7 @@ private: Status PopulateIndex(uint64_t file_size); uint64_t Next(uint64_t offset, Slice* key, Slice* value, Slice* tmp_slice); Status GetOffset(const Slice& target, uint64_t* offset); + bool MayHavePrefix(const Slice& target_prefix); // No copying allowed explicit PlainTableReader(const TableReader&) = delete; diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index 8d3fd2412..a8a344434 100644 --- a/table/table_reader_bench.cc +++ 
b/table/table_reader_bench.cc @@ -242,7 +242,7 @@ int main(int argc, char** argv) { if (FLAGS_plain_table) { options.allow_mmap_reads = true; env_options.use_mmap_reads = true; - tf = new rocksdb::PlainTableFactory(16, FLAGS_prefix_len); + tf = new rocksdb::PlainTableFactory(16, FLAGS_prefix_len, FLAGS_prefix_len); } else { tf = new rocksdb::BlockBasedTableFactory(); } From dfa1460d88535bf09aaeffdb6d2bdca513b1dbc2 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 21 Nov 2013 23:33:45 -0800 Subject: [PATCH 09/70] [For Performance Branch] Bloom filter in PlainTableIterator::Seek() - Update 1 Summary: Address @haobo's comments in D14277 Test Plan: ./indexed_table_db_test Reviewers: haobo CC: Task ID: # Blame Rev: --- include/rocksdb/plain_table_factory.h | 4 ++-- table/plain_table_reader.cc | 10 +++++----- table/table_reader_bench.cc | 4 +++- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/include/rocksdb/plain_table_factory.h b/include/rocksdb/plain_table_factory.h index 0d843e2c2..3d26c6e4e 100644 --- a/include/rocksdb/plain_table_factory.h +++ b/include/rocksdb/plain_table_factory.h @@ -48,8 +48,8 @@ public: ~PlainTableFactory() { } // user_key_size is the length of the user key. key_prefix_len is the - // length of the prefix used for im-memory indexes. bloom_num_bits is - // number of bits is used for bloom filer per key. hash_table_ratio is + // length of the prefix used for in-memory indexes. bloom_num_bits is + // number of bits used for bloom filer per key. hash_table_ratio is // the desired ultilization of the hash table used for prefix hashing. // hash_table_ratio = number of prefixes / #buckets in the hash table PlainTableFactory(int user_key_size, int key_prefix_len, diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index 657dc031e..9d0283b22 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -162,10 +162,10 @@ Status PlainTableReader::PopulateIndex(uint64_t file_size) { // Make the hash table 3/5 full std::vector filter_entries(0); // for creating bloom filter; if (filter_policy_ != nullptr) { - filter_entries.resize(tmp_index.size()); + filter_entries.reserve(tmp_index.size()); } double hash_table_size_multipier = - (hash_table_ratio_ < 1.0) ? 1.0 : 1.0 / hash_table_ratio_; + (hash_table_ratio_ > 1.0) ? 
1.0 : 1.0 / hash_table_ratio_; hash_table_size_ = tmp_index.size() * hash_table_size_multipier + 1; hash_table_ = new char[GetHashTableRecordLen() * hash_table_size_]; for (int i = 0; i < hash_table_size_; i++) { @@ -173,10 +173,9 @@ Status PlainTableReader::PopulateIndex(uint64_t file_size) { kOffsetLen); } - size_t count = 0; for (auto it = tmp_index.begin(); it != tmp_index.end(); ++it) { if (filter_policy_ != nullptr) { - filter_entries[count++] = it->first; + filter_entries.push_back(it->first); } int bucket = GetHashTableBucket(it->first); @@ -194,7 +193,8 @@ Status PlainTableReader::PopulateIndex(uint64_t file_size) { memcpy(bucket_ptr + key_prefix_len_, &it->second, kOffsetLen); } if (filter_policy_ != nullptr) { - filter_policy_->CreateFilter(&filter_entries[0], count, &filter_str_); + filter_policy_->CreateFilter(&filter_entries[0], filter_entries.size(), + &filter_str_); filter_slice_ = Slice(filter_str_.data(), filter_str_.size()); } diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index a8a344434..7e7e6b7da 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -242,7 +242,9 @@ int main(int argc, char** argv) { if (FLAGS_plain_table) { options.allow_mmap_reads = true; env_options.use_mmap_reads = true; - tf = new rocksdb::PlainTableFactory(16, FLAGS_prefix_len, FLAGS_prefix_len); + tf = new rocksdb::PlainTableFactory(16, FLAGS_prefix_len, + (FLAGS_prefix_len == 16) ? 0 : 8, + 0.75); } else { tf = new rocksdb::BlockBasedTableFactory(); } From 8aac46d6864d56b1ff2baa4a7b01d2f2e72f28f9 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Tue, 26 Nov 2013 12:23:02 -0800 Subject: [PATCH 10/70] [RocksDB Performance Branch] Fix a regression bug of munmap Summary: Fix a stupid bug I just introduced in b59d4d5a5051263b4bfcef00913219ffe4654e42, which I didn't even mean to include. GCC might remove the munmap. Test Plan: Run it and make sure munmap succeeds Reviewers: haobo, kailiu Reviewed By: kailiu CC: dhruba, reconnect.grayhat, leveldb Differential Revision: https://reviews.facebook.net/D14361 --- util/env_posix.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/util/env_posix.cc b/util/env_posix.cc index c6995b30c..1643d2927 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -307,7 +307,11 @@ class PosixMmapReadableFile: public RandomAccessFile { assert(options.use_os_buffer); } virtual ~PosixMmapReadableFile() { - assert(munmap(mmapped_region_, length_) == 0); + int ret = munmap(mmapped_region_, length_); + if (ret != 0) { + fprintf(stdout, "failed to munmap %p length %zu \n", + mmapped_region_, length_); + } } virtual Status Read(uint64_t offset, size_t n, Slice* result, From 4e6463ea44e626b4cfc77795da125e000672b6b3 Mon Sep 17 00:00:00 2001 From: Haobo Xu Date: Fri, 22 Nov 2013 13:31:00 -0800 Subject: [PATCH 11/70] [RocksDB][Performance Branch] Make height and branching factor configurable for skiplist implementation Summary: As title. Especially, HashSkipListRepFactory will be able to specify a relatively small height, to reduce the memory overhead of one skiplist per bucket. 
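For illustration only (this snippet is not part of the patch), a caller opting into short per-bucket skiplists might configure the factory as follows; NewFixedPrefixTransform() is assumed from include/rocksdb/slice_transform.h, and the factory takes ownership of the transform passed to it:

    // Hypothetical setup: 8-byte prefixes, the default 1M buckets,
    // and deliberately short towers to keep per-bucket overhead low.
    rocksdb::Options options;
    options.prefix_extractor = rocksdb::NewFixedPrefixTransform(8);
    options.memtable_factory.reset(rocksdb::NewHashSkipListRepFactory(
        options.prefix_extractor,  // transform (owned by the factory)
        1000000,                   // bucket_count
        4,                         // skiplist_height
        4));                       // skiplist_branching_factor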
Test Plan: make check and test it on leaf4 Reviewers: dhruba, sdong, kailiu CC: reconnect.grayhat, leveldb Differential Revision: https://reviews.facebook.net/D14307 --- db/prefix_test.cc | 6 ++++-- db/skiplist.h | 33 +++++++++++++++++++---------- include/rocksdb/memtablerep.h | 11 ++++++++-- util/hash_skiplist_rep.cc | 39 ++++++++++++++++++++++++++--------- 4 files changed, 64 insertions(+), 25 deletions(-) diff --git a/db/prefix_test.cc b/db/prefix_test.cc index 6c7fc1697..4b15e63e3 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -22,6 +22,7 @@ DEFINE_uint64(items_per_prefix, 10, "total number of values per prefix"); DEFINE_int64(write_buffer_size, 1000000000, ""); DEFINE_int64(max_write_buffer_number, 8, ""); DEFINE_int64(min_write_buffer_number_to_merge, 7, ""); +DEFINE_int32(skiplist_height, 4, ""); // Path to the database on file system const std::string kDbName = rocksdb::test::TmpDir() + "/prefix_test"; @@ -111,7 +112,8 @@ class PrefixTest { options.prefix_extractor = prefix_extractor; if (FLAGS_use_nolock_version) { options.memtable_factory.reset(NewHashSkipListRepFactory( - prefix_extractor, FLAGS_bucket_count)); + prefix_extractor, FLAGS_bucket_count, + FLAGS_skiplist_height)); } else { options.memtable_factory = std::make_shared( @@ -152,7 +154,7 @@ TEST(PrefixTest, DynamicPrefixIterator) { TestKey test_key(prefix, sorted); Slice key = TestKeyToSlice(test_key); - std::string value = "v" + std::to_string(sorted); + std::string value(40, 0); ASSERT_OK(db->Put(write_options, key, value)); } diff --git a/db/skiplist.h b/db/skiplist.h index 06a35d911..54b4f7446 100644 --- a/db/skiplist.h +++ b/db/skiplist.h @@ -47,7 +47,8 @@ class SkipList { // Create a new SkipList object that will use "cmp" for comparing keys, // and will allocate memory using "*arena". Objects allocated in the arena // must remain allocated for the lifetime of the skiplist object. - explicit SkipList(Comparator cmp, Arena* arena); + explicit SkipList(Comparator cmp, Arena* arena, + int32_t max_height = 12, int32_t branching_factor = 4); // Insert key into the list. // REQUIRES: nothing that compares equal to key is currently in the list. 
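A back-of-the-envelope check of why these two knobs control memory (numbers illustrative, not from the patch): a node's tower reaches level k with probability (1/branching_factor)^(k-1), so the expected tower height is sum_{k>=1} (1/b)^(k-1) = b/(b-1), about 1.33 next-pointers per node for the default b = 4, essentially independent of the cap. What the cap controls is the fixed per-list cost: the head node (and the prev_ array introduced below) each allocate max_height pointers regardless of how many keys the list holds, so cutting max_height from 12 to 4 saves on the order of 2 * 8 pointers, i.e. roughly 128 bytes per bucket on a 64-bit platform, which adds up when there is one skiplist per hash bucket.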
@@ -101,7 +102,8 @@ class SkipList { }; private: - enum { kMaxHeight = 12 }; + const int32_t kMaxHeight_; + const int32_t kBranching_; // Immutable after construction Comparator const compare_; @@ -114,8 +116,8 @@ class SkipList { port::AtomicPointer max_height_; // Height of the entire list // Used for optimizing sequential insert patterns - Node* prev_[kMaxHeight]; - int prev_height_; + Node** prev_; + int32_t prev_height_; inline int GetMaxHeight() const { return static_cast( @@ -257,13 +259,12 @@ inline void SkipList::Iterator::SeekToLast() { template int SkipList::RandomHeight() { // Increase height with probability 1 in kBranching - static const unsigned int kBranching = 4; int height = 1; - while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) { + while (height < kMaxHeight_ && ((rnd_.Next() % kBranching_) == 0)) { height++; } assert(height > 0); - assert(height <= kMaxHeight); + assert(height <= kMaxHeight_); return height; } @@ -353,14 +354,24 @@ typename SkipList::Node* SkipList::FindLast() } template -SkipList::SkipList(Comparator cmp, Arena* arena) - : compare_(cmp), +SkipList::SkipList(Comparator cmp, Arena* arena, + int32_t max_height, + int32_t branching_factor) + : kMaxHeight_(max_height), + kBranching_(branching_factor), + compare_(cmp), arena_(arena), - head_(NewNode(0 /* any key will do */, kMaxHeight)), + head_(NewNode(0 /* any key will do */, max_height)), max_height_(reinterpret_cast(1)), prev_height_(1), rnd_(0xdeadbeef) { - for (int i = 0; i < kMaxHeight; i++) { + assert(kMaxHeight_ > 0); + assert(kBranching_ > 0); + // Allocate the prev_ Node* array, directly from the passed-in arena. + // prev_ does not need to be freed, as its life cycle is tied up with + // the arena as a whole. + prev_ = (Node**) arena_->AllocateAligned(sizeof(Node*) * kMaxHeight_); + for (int i = 0; i < kMaxHeight_; i++) { head_->SetNext(i, nullptr); prev_[i] = head_; } diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index 83b15d256..53a7f5064 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -267,9 +267,16 @@ public: // The same as TransformRepFactory except it doesn't use locks. // Experimental, will replace TransformRepFactory once we are sure -// it performs better +// it performs better. It contains a fixed array of buckets, each +// pointing to a skiplist (null if the bucket is empty). 
+// bucket_count: number of fixed array buckets +// skiplist_height: the max height of the skiplist +// skiplist_branching_factor: probabilistic size ratio between adjacent +// link lists in the skiplist extern MemTableRepFactory* NewHashSkipListRepFactory( - const SliceTransform* transform, size_t bucket_count = 1000000); + const SliceTransform* transform, size_t bucket_count = 1000000, + int32_t skiplist_height = 4, int32_t skiplist_branching_factor = 4 +); } diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc index 290ce9d63..bcc459f66 100644 --- a/util/hash_skiplist_rep.cc +++ b/util/hash_skiplist_rep.cc @@ -20,7 +20,8 @@ namespace { class HashSkipListRep : public MemTableRep { public: HashSkipListRep(MemTableRep::KeyComparator& compare, Arena* arena, - const SliceTransform* transform, size_t bucket_size); + const SliceTransform* transform, size_t bucket_size, + int32_t skiplist_height, int32_t skiplist_branching_factor); virtual void Insert(const char* key) override; @@ -47,6 +48,9 @@ class HashSkipListRep : public MemTableRep { size_t bucket_size_; + const int32_t skiplist_height_; + const int32_t skiplist_branching_factor_; + // Maps slices (which are transformed user keys) to buckets of keys sharing // the same transform. port::AtomicPointer* buckets_; @@ -215,8 +219,12 @@ class HashSkipListRep : public MemTableRep { }; HashSkipListRep::HashSkipListRep(MemTableRep::KeyComparator& compare, - Arena* arena, const SliceTransform* transform, size_t bucket_size) + Arena* arena, const SliceTransform* transform, + size_t bucket_size, int32_t skiplist_height, + int32_t skiplist_branching_factor) : bucket_size_(bucket_size), + skiplist_height_(skiplist_height), + skiplist_branching_factor_(skiplist_branching_factor), transform_(transform), compare_(compare), arena_(arena), @@ -239,7 +247,8 @@ HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket( auto bucket = GetBucket(hash); if (bucket == nullptr) { auto addr = arena_->AllocateAligned(sizeof(Bucket)); - bucket = new (addr) Bucket(compare_, arena_); + bucket = new (addr) Bucket(compare_, arena_, skiplist_height_, + skiplist_branching_factor_); buckets_[hash].Release_Store(static_cast(bucket)); } return bucket; @@ -302,17 +311,23 @@ std::shared_ptr class HashSkipListRepFactory : public MemTableRepFactory { public: - explicit HashSkipListRepFactory(const SliceTransform* transform, - size_t bucket_count = 1000000) - : transform_(transform), - bucket_count_(bucket_count) { } + explicit HashSkipListRepFactory( + const SliceTransform* transform, + size_t bucket_count, + int32_t skiplist_height, + int32_t skiplist_branching_factor) + : transform_(transform), + bucket_count_(bucket_count), + skiplist_height_(skiplist_height), + skiplist_branching_factor_(skiplist_branching_factor) { } virtual ~HashSkipListRepFactory() { delete transform_; } virtual std::shared_ptr CreateMemTableRep( MemTableRep::KeyComparator& compare, Arena* arena) override { return std::make_shared(compare, arena, transform_, - bucket_count_); + bucket_count_, skiplist_height_, + skiplist_branching_factor_); } virtual const char* Name() const override { @@ -324,11 +339,15 @@ class HashSkipListRepFactory : public MemTableRepFactory { private: const SliceTransform* transform_; const size_t bucket_count_; + const int32_t skiplist_height_; + const int32_t skiplist_branching_factor_; }; MemTableRepFactory* NewHashSkipListRepFactory( - const SliceTransform* transform, size_t bucket_count) { - return new HashSkipListRepFactory(transform, bucket_count); + const 
SliceTransform* transform, size_t bucket_count, + int32_t skiplist_height, int32_t skiplist_branching_factor) { + return new HashSkipListRepFactory(transform, bucket_count, + skiplist_height, skiplist_branching_factor); } } // namespace rocksdb From 8c424456fcdaa700008fde8ff167ec40b4aaabb8 Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Mon, 2 Dec 2013 20:05:16 -0800 Subject: [PATCH 12/70] Make the default compilation debug-friendly --- Makefile | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 6fa2864eb..62f180846 100644 --- a/Makefile +++ b/Makefile @@ -6,11 +6,7 @@ INSTALL_PATH ?= $(CURDIR) #----------------------------------------------- -# Uncomment exactly one of the lines labelled (A), (B), and (C) below -# to switch between compilation modes. - -# OPT ?= -DNDEBUG # (A) Production use (optimized mode) -OPT += -O2 -fno-omit-frame-pointer -momit-leaf-frame-pointer +OPT += -fno-omit-frame-pointer -momit-leaf-frame-pointer #----------------------------------------------- # detect what platform we're building on @@ -139,7 +135,7 @@ all: $(LIBRARY) $(PROGRAMS) release: $(MAKE) clean - OPT=-DNDEBUG $(MAKE) -j32 + OPT="-DNDEBUG -O2" $(MAKE) -j32 coverage: $(MAKE) clean From f040e536e4c4128b6db5cdf71571925e56963db2 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 21 Nov 2013 11:11:02 -0800 Subject: [PATCH 13/70] [RocksDB Performance Branch] A more customized index in PlainTableReader Summary: PlainTableReader to use a more customized hash table. This patch assumes the SST file is smaller than 2GB: (1) Every bucket uses 32-bit integer (2) no key is stored in bucket (3) use the first bit of the bucket value to distinguish it points to the file offset or a second level index. This index schema fits the use case that most of prefixes have very small number of keys Test Plan: plain_table_db_test Reviewers: haobo, kailiu, dhruba Reviewed By: haobo CC: nkg-, leveldb Differential Revision: https://reviews.facebook.net/D14343 --- include/rocksdb/plain_table_factory.h | 4 +- table/plain_table_builder.cc | 8 +- table/plain_table_reader.cc | 341 +++++++++++++++----------- table/plain_table_reader.h | 48 ++-- 4 files changed, 237 insertions(+), 164 deletions(-) diff --git a/include/rocksdb/plain_table_factory.h b/include/rocksdb/plain_table_factory.h index 3d26c6e4e..2355e43d4 100644 --- a/include/rocksdb/plain_table_factory.h +++ b/include/rocksdb/plain_table_factory.h @@ -23,7 +23,9 @@ class TableBuilder; // IndexedTable requires fixed length key, configured as a constructor // parameter of the factory class. 
Output file format: -// +--------------------------------------------+ <= key1 offset +// +-------------+ +// | version | +// +-------------+------------------------------+ <= key1 offset // | key1 | value_size (4 bytes) | | // +----------------------------------------+ | // | value1 | diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc index ed0b4d988..5a6e41df6 100644 --- a/table/plain_table_builder.cc +++ b/table/plain_table_builder.cc @@ -25,6 +25,10 @@ PlainTableBuilder::PlainTableBuilder(const Options& options, int user_key_size, int key_prefix_len) : options_(options), file_(file), user_key_size_(user_key_size), key_prefix_len_(key_prefix_len) { + std::string version; + PutFixed32(&version, 1 | 0x80000000); + file_->Append(Slice(version)); + offset_ = 4; } PlainTableBuilder::~PlainTableBuilder() { @@ -43,11 +47,11 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { std::string size; int value_size = value.size(); - PutFixed32(&size, value_size); + PutVarint32(&size, value_size); Slice sizeSlice(size); file_->Append(sizeSlice); file_->Append(value); - offset_ += value_size + 4; + offset_ += value_size + size.length(); num_entries_++; } diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index 9d0283b22..e7f48df33 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -5,6 +5,7 @@ #include "table/plain_table_reader.h" #include +#include #include "db/dbformat.h" @@ -23,21 +24,35 @@ #include "util/coding.h" #include "util/hash.h" #include "util/histogram.h" +#include "util/murmurhash.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" + namespace std { template<> struct hash { public: std::size_t operator()(rocksdb::Slice const& s) const { - return rocksdb::Hash(s.data(), s.size(), 397); + return MurmurHash(s.data(), s.size(), 397); + } +}; + +class slice_comparator { +public: + bool operator()(rocksdb::Slice const& s1, rocksdb::Slice const& s2) { + return s1.compare(s2) < 0; } }; } namespace rocksdb { +static uint32_t getBucketId(Slice const& s, size_t prefix_len, + uint32_t num_buckets) { + return MurmurHash(s.data(), prefix_len, 397) % num_buckets; +} + PlainTableReader::PlainTableReader(const EnvOptions& storage_options, uint64_t file_size, int user_key_size, int key_prefix_len, int bloom_bits_per_key, @@ -51,6 +66,8 @@ PlainTableReader::PlainTableReader(const EnvOptions& storage_options, filter_policy_ = nullptr; } hash_table_ = nullptr; + data_start_offset_ = 0; + data_end_offset_ = file_size; } PlainTableReader::~PlainTableReader() { @@ -73,6 +90,10 @@ Status PlainTableReader::Open(const Options& options, double hash_table_ratio) { assert(options.allow_mmap_reads); + if (file_size > 2147483646) { + return Status::NotSupported("File is too large for PlainTableReader!"); + } + PlainTableReader* t = new PlainTableReader(soptions, file_size, user_key_size, key_prefix_len, @@ -101,104 +122,111 @@ Iterator* PlainTableReader::NewIterator(const ReadOptions& options) { } Status PlainTableReader::PopulateIndex(uint64_t file_size) { + // Get mmapped memory to file_data_. 
+ Status s = file_->Read(0, file_size_, &file_data_, nullptr); + if (!s.ok()) { + return s; + } + version_ = DecodeFixed32(file_data_.data()); + version_ ^= 0x80000000; + assert(version_ == 1); + data_start_offset_ = 4; + data_end_offset_ = file_size; + Slice key_slice; Slice key_prefix_slice; Slice key_suffix_slice; Slice value_slice; - Slice tmp_slice; Slice prev_key_prefix_slice; - uint64_t pos = 0; - uint64_t data_offset_for_cur_prefix = 0; - int count_prefix = 0; + uint32_t pos = data_start_offset_; + int key_index_within_prefix = 0; bool first = true; std::string prefix_sub_index; HistogramImpl keys_per_prefix_hist; - std::unordered_map tmp_index; + // Need map to be ordered to make sure sub indexes generated + // are in order. + std::map prefix2map; while (pos < file_size) { - uint64_t key_offset = pos; - pos = Next(pos, &key_slice, &value_slice, &tmp_slice); + uint32_t key_offset = pos; + status_ = Next(pos, &key_slice, &value_slice, pos); key_prefix_slice = Slice(key_slice.data(), key_prefix_len_); if (first || prev_key_prefix_slice != key_prefix_slice) { if (!first) { - if (count_prefix < 8 || key_prefix_len_ == user_key_size_) { - tmp_index[prev_key_prefix_slice] = data_offset_for_cur_prefix; - } else { - tmp_index[prev_key_prefix_slice] = sub_index_.length() - | kSubIndexMask; - PutFixed32(&sub_index_, (count_prefix - 1) / 8 + 1); - sub_index_.append(prefix_sub_index); - } - prefix_sub_index.clear(); - data_offset_for_cur_prefix = key_offset; - keys_per_prefix_hist.Add(count_prefix); + keys_per_prefix_hist.Add(key_index_within_prefix); } + key_index_within_prefix = 0; prev_key_prefix_slice = key_prefix_slice; - count_prefix = 1; - } else { - count_prefix++; - } - if (key_prefix_len_ < user_key_size_ && count_prefix % 8 == 1) { - prefix_sub_index.append(key_slice.data() + key_prefix_len_, - user_key_size_ - key_prefix_len_); - PutFixed64(&prefix_sub_index, key_offset); } + if (key_index_within_prefix++ % 8 == 0) { + // Add an index key for every 8 keys + std::string& prefix_index = prefix2map[key_prefix_slice]; + PutFixed32(&prefix_index, key_offset); + } first = false; } - keys_per_prefix_hist.Add(count_prefix); - if (count_prefix <= 2 || key_prefix_len_ == user_key_size_) { - tmp_index[prev_key_prefix_slice] = data_offset_for_cur_prefix; - } else { - tmp_index[prev_key_prefix_slice] = sub_index_.length() | kSubIndexMask; - PutFixed32(&sub_index_, (count_prefix - 1) / 8 + 1); - sub_index_.append(prefix_sub_index); - } - + keys_per_prefix_hist.Add(key_index_within_prefix); if (hash_table_ != nullptr) { delete[] hash_table_; } - // Make the hash table 3/5 full std::vector filter_entries(0); // for creating bloom filter; if (filter_policy_ != nullptr) { - filter_entries.reserve(tmp_index.size()); + filter_entries.reserve(prefix2map.size()); } double hash_table_size_multipier = (hash_table_ratio_ > 1.0) ? 
1.0 : 1.0 / hash_table_ratio_; - hash_table_size_ = tmp_index.size() * hash_table_size_multipier + 1; - hash_table_ = new char[GetHashTableRecordLen() * hash_table_size_]; - for (int i = 0; i < hash_table_size_; i++) { - memcpy(GetHashTableBucketPtr(i) + key_prefix_len_, &file_size_, - kOffsetLen); - } - - for (auto it = tmp_index.begin(); it != tmp_index.end(); ++it) { + hash_table_size_ = prefix2map.size() * hash_table_size_multipier + 1; + hash_table_ = new uint32_t[hash_table_size_]; + std::vector hash2map(hash_table_size_); + + size_t sub_index_size_needed = 0; + for (auto& p: prefix2map) { + auto& sub_index = hash2map[getBucketId(p.first, key_prefix_len_, + hash_table_size_)]; + if (sub_index.length() > 0 || p.second.length() > kOffsetLen) { + if (sub_index.length() <= kOffsetLen) { + sub_index_size_needed += sub_index.length() + 4; + } + sub_index_size_needed += p.second.length(); + } + sub_index.append(p.second); if (filter_policy_ != nullptr) { - filter_entries.push_back(it->first); + filter_entries.push_back(p.first); } + } - int bucket = GetHashTableBucket(it->first); - uint64_t* hash_value; - while (true) { - GetHashValue(bucket, &hash_value); - if (*hash_value == file_size_) { - break; - } - bucket = (bucket + 1) % hash_table_size_; + sub_index_.clear(); + Log(options_.info_log, "Reserving %zu bytes for sub index", + sub_index_size_needed); + sub_index_.reserve(sub_index_size_needed); + for (int i = 0; i < hash_table_size_; i++) { + uint32_t num_keys_for_bucket = hash2map[i].length() / kOffsetLen; + switch (num_keys_for_bucket) { + case 0: + // No key for bucket + hash_table_[i] = data_end_offset_; + break; + case 1: + // point directly to the file offset + hash_table_[i] = DecodeFixed32(hash2map[i].data()); + break; + default: + // point to index block + hash_table_[i] = sub_index_.length() | kSubIndexMask; + PutFixed32(&sub_index_, num_keys_for_bucket); + sub_index_.append(hash2map[i]); } - - char* bucket_ptr = GetHashTableBucketPtr(bucket); - memcpy(bucket_ptr, it->first.data(), key_prefix_len_); - memcpy(bucket_ptr + key_prefix_len_, &it->second, kOffsetLen); } if (filter_policy_ != nullptr) { + filter_str_.clear(); filter_policy_->CreateFilter(&filter_entries[0], filter_entries.size(), &filter_str_); filter_slice_ = Slice(filter_str_.data(), filter_str_.size()); } - Log(options_.info_log, "Number of prefixes: %d, suffix_map length %ld", + Log(options_.info_log, "hash table size: %d, suffix_map length %zu", hash_table_size_, sub_index_.length()); Log(options_.info_log, "Number of Keys per prefix Histogram: %s", keys_per_prefix_hist.ToString().c_str()); @@ -206,51 +234,33 @@ Status PlainTableReader::PopulateIndex(uint64_t file_size) { return Status::OK(); } -inline int PlainTableReader::GetHashTableBucket(Slice key) { - return rocksdb::Hash(key.data(), key_prefix_len_, 397) % hash_table_size_; -} - -inline void PlainTableReader::GetHashValue(int bucket, uint64_t** ret_value) { - *ret_value = (uint64_t*) (GetHashTableBucketPtr(bucket) + key_prefix_len_); -} - -Status PlainTableReader::GetOffset(const Slice& target, uint64_t* offset) { - Status s; - int bucket = GetHashTableBucket(target); - uint64_t* found_value; - Slice hash_key; - while (true) { - GetHashValue(bucket, &found_value); - if (*found_value == file_size_) { - break; - } - GetHashKey(bucket, &hash_key); - if (target.starts_with(hash_key)) { - break; - } - bucket = (bucket + 1) % hash_table_size_; - } - - if (*found_value == file_size_ || (*found_value & kSubIndexMask) == 0) { - *offset = *found_value; - return 
Status::OK();
+uint32_t PlainTableReader::GetOffset(const Slice& target,
+                                     bool& prefix_matched) {
+  prefix_matched = false;
+  int bucket = getBucketId(target, key_prefix_len_, hash_table_size_);
+  uint32_t bucket_value = hash_table_[bucket];
+  if (bucket_value == data_end_offset_) {
+    return data_end_offset_;
+  } else if ((bucket_value & kSubIndexMask) == 0) {
+    // point directly to the file
+    return bucket_value;
   }
+  // point to sub-index, need to do a binary search
   uint32_t low = 0;
-  uint64_t prefix_index_offset = *found_value ^ kSubIndexMask;
-  uint32_t high = DecodeFixed32(sub_index_.data() + prefix_index_offset);
+  uint64_t prefix_index_offset = bucket_value ^ kSubIndexMask;
+  uint32_t upper_bound = DecodeFixed32(sub_index_.data() + prefix_index_offset);
+  uint32_t high = upper_bound;
   uint64_t base_offset = prefix_index_offset + 4;
-  char* mid_key_str = new char[target.size()];
-  memcpy(mid_key_str, target.data(), target.size());
-  Slice mid_key = Slice(mid_key_str, target.size());
+  Slice mid_key;

-  // The key is between (low, high). Do a binary search between it.
+  // The key is in [low, high). Do a binary search within that range.
   while (high - low > 1) {
     uint32_t mid = (high + low) / 2;
-    const char* base = sub_index_.data() + base_offset
-        + (user_key_size_ - key_prefix_len_ + kOffsetLen) * mid;
-    memcpy(mid_key_str + key_prefix_len_, base,
-           user_key_size_ - key_prefix_len_);
+    const char* index_offset = sub_index_.data() + base_offset
+        + kOffsetLen * mid;
+    uint32_t file_offset = DecodeFixed32(index_offset);
+    mid_key = Slice(file_data_.data() + file_offset, user_key_size_);

     int cmp_result = options_.comparator->Compare(target, mid_key);
     if (cmp_result > 0) {
@@ -259,21 +269,32 @@ Status PlainTableReader::GetOffset(const Slice& target, uint64_t* offset) {
       if (cmp_result == 0) {
         // Happened to find the exact key, or target is smaller than the
         // first key after base_offset.
-        *offset = DecodeFixed64(base + user_key_size_ - key_prefix_len_);
-        delete[] mid_key_str;
-        return s;
+        prefix_matched = true;
+        return file_offset;
       } else {
         high = mid;
       }
     }
   }
-  const char* base = sub_index_.data() + base_offset
-      + (user_key_size_ - key_prefix_len_ + kOffsetLen) * low;
-  *offset = DecodeFixed64(base + user_key_size_ - key_prefix_len_);
-
-  delete[] mid_key_str;
-  return s;
+  // The key is between low and low+1 (if it exists). Both of them can have
+  // the correct prefix. We need to rule out at least one, to avoid missing
+  // the correct one.
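+  // For example, with a 4-byte prefix, a search for a key starting with
+  // "abce" may end with low pointing at an "abcd..." entry and low+1 at an
+  // "abce..." entry; comparing low's prefix against the target's is what
+  // decides which of the two offsets to return.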
+  uint32_t low_key_offset = DecodeFixed32(
+      sub_index_.data() + base_offset + kOffsetLen * low);
+  if (low + 1 < upper_bound) {
+    if (Slice(file_data_.data() + low_key_offset, key_prefix_len_)
+        == Slice(target.data(), key_prefix_len_)) {
+      prefix_matched = true;
+    } else {
+      prefix_matched = false;
+      return DecodeFixed32(
+          sub_index_.data() + base_offset + kOffsetLen * (low + 1));
+    }
+  } else {
+    prefix_matched = false;
+  }
+  return low_key_offset;
 }

 bool PlainTableReader::MayHavePrefix(const Slice& target_prefix) {
@@ -282,46 +303,74 @@
 }

-uint64_t PlainTableReader::Next(uint64_t offset, Slice* key, Slice* value,
-                                Slice* tmp_slice) {
-  if (offset >= file_size_) {
-    return file_size_;
+Status PlainTableReader::Next(uint32_t offset, Slice* key, Slice* value,
+                              uint32_t& next_offset) {
+  if (offset == data_end_offset_) {
+    next_offset = data_end_offset_;
+    return Status::OK();
+  }
+
+  if (offset > data_end_offset_) {
+    return Status::Corruption("Offset is beyond the end of the file");
   }

-  int internal_key_size = GetInternalKeyLength();
-  Status s = file_->Read(offset, internal_key_size, key, nullptr);
-  offset += internal_key_size;
+  int internal_key_size = GetInternalKeyLength();
+  if (offset + internal_key_size >= data_end_offset_) {
+    return Status::Corruption("Unable to read the next key");
+  }

-  s = file_->Read(offset, 4, tmp_slice, nullptr);
-  offset += 4;
-  uint32_t value_size = DecodeFixed32(tmp_slice->data());
+  const char* key_ptr = file_data_.data() + offset;
+  *key = Slice(key_ptr, internal_key_size);

-  s = file_->Read(offset, value_size, value, nullptr);
-  offset += value_size;
+  uint32_t value_size;
+  const char* value_ptr = GetVarint32Ptr(key_ptr + internal_key_size,
+                                         file_data_.data() + data_end_offset_,
+                                         &value_size);
+  if (value_ptr == nullptr) {
+    return Status::Corruption("Error reading value length.");
+  }
+  next_offset = offset + (value_ptr - key_ptr) + value_size;
+  if (next_offset > data_end_offset_) {
+    return Status::Corruption("Reached end of file when reading value");
+  }
+  *value = Slice(value_ptr, value_size);

-  return offset;
+  return Status::OK();
 }

 Status PlainTableReader::Get(
     const ReadOptions& ro, const Slice& target, void* arg,
     bool (*saver)(void*, const Slice&, const Slice&, bool),
     void (*mark_key_may_exist)(void*)) {
-  uint64_t offset;
-  Status s = GetOffset(target, &offset);
-  if (!s.ok()) {
-    return s;
+  // Check bloom filter first.
+  if (!MayHavePrefix(Slice(target.data(), key_prefix_len_))) {
+    return Status::OK();
   }
+
+  uint32_t offset;
+  bool prefix_match;
+  offset = GetOffset(target, prefix_match);
   Slice found_key;
   Slice found_value;
-  Slice tmp_slice;
-  while (offset < file_size_) {
-    offset = Next(offset, &found_key, &found_value, &tmp_slice);
+  while (offset < data_end_offset_) {
+    Status s = Next(offset, &found_key, &found_value, offset);
+    if (!s.ok()) {
+      return s;
+    }
+    if (!prefix_match) {
+      // Need to verify prefix for the first key found if it is not yet
+      // checked.
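+      // (When GetOffset() resolved the hash bucket straight to a file
+      // offset, no prefix comparison has happened yet, so the first key
+      // read here may not share the target's prefix at all.)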
+ if (!target.starts_with(Slice(found_key.data(), key_prefix_len_))) { + break; + } + prefix_match = true; + } if (options_.comparator->Compare(found_key, target) >= 0 && !(*saver)(arg, found_key, found_value, true)) { break; } } - return s; + return Status::OK(); } bool PlainTableReader::TEST_KeyInCache(const ReadOptions& options, @@ -342,11 +391,12 @@ PlainTableIterator::~PlainTableIterator() { } bool PlainTableIterator::Valid() const { - return offset_ < table_->file_size_ && offset_ >= 0; + return offset_ < table_->data_end_offset_ + && offset_ >= table_->data_start_offset_; } void PlainTableIterator::SeekToFirst() { - next_offset_ = 0; + next_offset_ = table_->data_start_offset_; Next(); } @@ -356,26 +406,35 @@ void PlainTableIterator::SeekToLast() { void PlainTableIterator::Seek(const Slice& target) { if (!table_->MayHavePrefix(Slice(target.data(), table_->key_prefix_len_))) { - offset_ = next_offset_ = table_->file_size_; + offset_ = next_offset_ = table_->data_end_offset_; return; } - - Status s = table_->GetOffset(target, &next_offset_); - if (!s.ok()) { - status_ = s; - } - if (next_offset_ < table_->file_size_) { - for (Next(); - Valid() && table_->options_.comparator->Compare(key(), target) < 0; - Next()) { + bool prefix_match; + next_offset_ = table_->GetOffset(target, prefix_match); + + if (next_offset_ < table_-> data_end_offset_) { + for (Next(); status_.ok() && Valid(); Next()) { + if (!prefix_match) { + // Need to verify the first key's prefix + if (!target.starts_with(Slice(key().data(), table_->key_prefix_len_))) { + offset_ = next_offset_ = table_->data_end_offset_; + break; + } + prefix_match = true; + } + if (table_->options_.comparator->Compare(key(), target) >= 0) { + break; + } } + } else { + offset_ = table_->data_end_offset_; } } void PlainTableIterator::Next() { offset_ = next_offset_; Slice tmp_slice; - next_offset_ = table_->Next(next_offset_, &key_, &value_, &tmp_slice); + status_ = table_->Next(next_offset_, &key_, &value_, next_offset_); } void PlainTableIterator::Prev() { diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index d9ac34326..eea8adfe6 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -25,7 +25,9 @@ using std::unique_ptr; using std::unordered_map; // Based on following output file format: -// +--------------------------------------------+ <= key1_data_offset +// +-------------+ +// | version | +// +-------------+------------------------------+ <= key1_data_offset // | key1 | value_size (4 bytes) | | // +----------------------------------------+ | // | value1 | @@ -85,7 +87,7 @@ public: ~PlainTableReader(); private: - char* hash_table_; + uint32_t* hash_table_; int hash_table_size_; std::string sub_index_; @@ -94,7 +96,11 @@ private: Status status_; unique_ptr file_; - uint64_t file_size_; + Slice file_data_; + uint32_t version_; + uint32_t file_size_; + uint32_t data_start_offset_; + uint32_t data_end_offset_; const size_t user_key_size_; const size_t key_prefix_len_; const double hash_table_ratio_; @@ -105,32 +111,34 @@ private: TableProperties tbl_props; static const size_t kNumInternalBytes = 8; - static const uint64_t kSubIndexMask = 0x8000000000000000; - static const size_t kOffsetLen = sizeof(uint64_t); + static const uint32_t kSubIndexMask = 0x80000000; + static const size_t kOffsetLen = sizeof(uint32_t); - inline int GetHashTableBucket(Slice key); inline size_t GetInternalKeyLength() { return user_key_size_ + kNumInternalBytes; } - inline size_t GetHashTableRecordLen() { - return 
key_prefix_len_ + kOffsetLen;
-  }
-  inline char* GetHashTableBucketPtr(int bucket) {
-    return hash_table_ + GetHashTableRecordLen() * bucket;
-  }
-  inline void GetHashKey(int bucket, Slice* slice) {
-    *slice = Slice(GetHashTableBucketPtr(bucket), key_prefix_len_);
-  }
-  inline void GetHashValue(int bucket, uint64_t** ret_value);

   friend class TableCache;
   friend class PlainTableIterator;

+  // Populate the internal indexes. It must be called before
+  // any query to the table.
+  // This call populates the hash table hash_table_, the second-level
+  // index sub_index_, and the bloom filter filter_slice_ if enabled.
   Status PopulateIndex(uint64_t file_size);
-  uint64_t Next(uint64_t offset, Slice* key, Slice* value, Slice* tmp_slice);
-  Status GetOffset(const Slice& target, uint64_t* offset);
+
+  // Check the bloom filter to see whether it might contain this prefix
   bool MayHavePrefix(const Slice& target_prefix);

+  // Read the key and value at offset into *key and *value.
+  // next_offset is set to the offset of the next key.
+  Status Next(uint32_t offset, Slice* key, Slice* value, uint32_t& next_offset);
+  // Get the file offset for key target.
+  // The output value prefix_matched is set to true if the offset is confirmed
+  // for a key with the same prefix as target.
+  uint32_t GetOffset(const Slice& target, bool& prefix_matched);
+
   // No copying allowed
   explicit PlainTableReader(const TableReader&) = delete;
   void operator=(const TableReader&) = delete;
@@ -162,8 +170,8 @@ public:

 private:
   PlainTableReader* table_;
-  uint64_t offset_;
-  uint64_t next_offset_;
+  uint32_t offset_;
+  uint32_t next_offset_;
   Slice key_;
   Slice value_;
   Status status_;

From 3a0e98d558194eab4ec94de92acdc031fe4ccaf0 Mon Sep 17 00:00:00 2001
From: Kai Liu
Date: Wed, 4 Dec 2013 15:09:41 -0800
Subject: [PATCH 14/70] Parameterize table magic number

Summary:
As we are adding different types of tables, they all might share the same
trailing structure used by the block-based table:

[metaindex block]
[index block]
[Footer]

To be able to identify different types of tables, we need to parameterize
the "magic number" in the `Footer`.

Test Plan: make check
---
 table/block_based_table_builder.cc | 10 +++++++++-
 table/block_based_table_builder.h  |  1 -
 table/block_based_table_reader.cc  |  4 +++-
 table/format.h                     | 13 +++++++------
 4 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc
index f846b1ffd..fa7d56472 100644
--- a/table/block_based_table_builder.cc
+++ b/table/block_based_table_builder.cc
@@ -77,6 +77,14 @@ void LogPropertiesCollectionError(

 }  // anonymous namespace

+// kBlockedBasedTableMagicNumber was picked by running
+//    echo http://code.google.com/p/leveldb/ | sha1sum
+// and taking the leading 64 bits.
+// Please note that kBlockedBasedTableMagicNumber may also be accessed by
+// other .cc files so it has to be explicitly declared with "extern".
+extern const uint64_t kBlockedBasedTableMagicNumber + = 0xdb4775248b80fb57ull; + struct BlockBasedTableBuilder::Rep { Options options; WritableFile* file; @@ -503,7 +511,7 @@ Status BlockBasedTableBuilder::Finish() { // Write footer if (ok()) { - Footer footer; + Footer footer(kBlockedBasedTableMagicNumber); footer.set_metaindex_handle(metaindex_block_handle); footer.set_index_handle(index_block_handle); std::string footer_encoding; diff --git a/table/block_based_table_builder.h b/table/block_based_table_builder.h index 517f8e785..710bfd5a1 100644 --- a/table/block_based_table_builder.h +++ b/table/block_based_table_builder.h @@ -20,7 +20,6 @@ class BlockBuilder; class BlockHandle; class WritableFile; - class BlockBasedTableBuilder : public TableBuilder { public: // Create a builder that will store the contents of the table it is diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 095c2999c..d5b52fba2 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -29,6 +29,8 @@ namespace rocksdb { +extern uint64_t kBlockedBasedTableMagicNumber; + // The longest the prefix of the cache key used to identify blocks can be. // We are using the fact that we know for Posix files the unique ID is three // varints. @@ -242,7 +244,7 @@ Status BlockBasedTable::Open(const Options& options, return Status::InvalidArgument("file is too short to be an sstable"); } - Footer footer; + Footer footer(kBlockedBasedTableMagicNumber); s = footer.DecodeFrom(&footer_input); if (!s.ok()) return s; diff --git a/table/format.h b/table/format.h index 2f1c1e8dc..bac51eab4 100644 --- a/table/format.h +++ b/table/format.h @@ -50,7 +50,12 @@ class BlockHandle { // end of every table file. class Footer { public: - Footer() { } + // @table_magic_number serves two purposes: + // 1. Identify different types of the tables. + // 2. Help us to identify if a given file is a valid sst. + Footer(uint64_t table_magic_number) : + kTableMagicNumber(table_magic_number) { + } // The block handle for the metaindex block of the table const BlockHandle& metaindex_handle() const { return metaindex_handle_; } @@ -77,13 +82,9 @@ class Footer { private: BlockHandle metaindex_handle_; BlockHandle index_handle_; + const uint64_t kTableMagicNumber; }; -// kTableMagicNumber was picked by running -// echo http://code.google.com/p/leveldb/ | sha1sum -// and taking the leading 64 bits. -static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull; - // 1-byte type + 32-bit crc static const size_t kBlockTrailerSize = 5; From 5dec7acd91e8efe7bd31f9e93457d98c5dc822ff Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Wed, 4 Dec 2013 15:43:09 -0800 Subject: [PATCH 15/70] Introducing the concept of NULL block handle --- table/format.cc | 1 + table/format.h | 26 ++++++++++++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/table/format.cc b/table/format.cc index ff6d8fa24..17add6680 100644 --- a/table/format.cc +++ b/table/format.cc @@ -34,6 +34,7 @@ Status BlockHandle::DecodeFrom(Slice* input) { return Status::Corruption("bad block handle"); } } +const BlockHandle BlockHandle::kNullBlockHandle(0, 0); void Footer::EncodeTo(std::string* dst) const { #ifndef NDEBUG diff --git a/table/format.h b/table/format.h index bac51eab4..c10cab857 100644 --- a/table/format.h +++ b/table/format.h @@ -26,6 +26,7 @@ struct ReadOptions; class BlockHandle { public: BlockHandle(); + BlockHandle(uint64_t offset, uint64_t size); // The offset of the block in the file. 
uint64_t offset() const { return offset_; }
@@ -38,12 +39,24 @@
   void EncodeTo(std::string* dst) const;
   Status DecodeFrom(Slice* input);

+  // If the block handle's offset and size are both "0", we will view it
+  // as a null block handle that points to nowhere.
+  bool IsNull() const {
+    return offset_ == 0 && size_ == 0;
+  }
+
+  static const BlockHandle& NullBlockHandle() {
+    return kNullBlockHandle;
+  }
+
   // Maximum encoding length of a BlockHandle
   enum { kMaxEncodedLength = 10 + 10 };

 private:
-  uint64_t offset_;
-  uint64_t size_;
+  uint64_t offset_ = 0;
+  uint64_t size_ = 0;
+
+  static const BlockHandle kNullBlockHandle;
 };

 // Footer encapsulates the fixed information stored at the tail
@@ -116,8 +129,13 @@ extern Status UncompressBlockContents(const char* data,
 // Implementation details follow.  Clients should ignore,

 inline BlockHandle::BlockHandle()
-    : offset_(~static_cast<uint64_t>(0)),
-      size_(~static_cast<uint64_t>(0)) {
+    : BlockHandle(~static_cast<uint64_t>(0),
+                  ~static_cast<uint64_t>(0)) {
+}
+
+inline BlockHandle::BlockHandle(uint64_t offset, uint64_t size)
+    : offset_(offset),
+      size_(size) {
 }

 }  // namespace rocksdb

From 219b35be6adfef521d1a2ce9df9012e6d6e0ab3d Mon Sep 17 00:00:00 2001
From: Kai Liu
Date: Wed, 4 Dec 2013 16:35:48 -0800
Subject: [PATCH 16/70] Generalize footer reading from file

Summary: Generalizing this process will help us to re-use the code for the
plain table.

Test Plan: ran ./table_test
---
 table/block_based_table_builder.cc |  8 ++++----
 table/block_based_table_reader.cc  | 21 +++------------------
 table/format.cc                    | 24 ++++++++++++++++++++++++
 table/format.h                     |  5 +++++
 4 files changed, 36 insertions(+), 22 deletions(-)

diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc
index fa7d56472..4cba3934f 100644
--- a/table/block_based_table_builder.cc
+++ b/table/block_based_table_builder.cc
@@ -77,12 +77,12 @@ void LogPropertiesCollectionError(

 }  // anonymous namespace

-// kBlockedBasedTableMagicNumber was picked by running
+// kBlockBasedTableMagicNumber was picked by running
 //    echo http://code.google.com/p/leveldb/ | sha1sum
 // and taking the leading 64 bits.
-// Please note that kBlockedBasedTableMagicNumber may also be accessed by
+// Please note that kBlockBasedTableMagicNumber may also be accessed by
 // other .cc files so it has to be explicitly declared with "extern".
-extern const uint64_t kBlockedBasedTableMagicNumber
+extern const uint64_t kBlockBasedTableMagicNumber
     = 0xdb4775248b80fb57ull;

 struct BlockBasedTableBuilder::Rep {
@@ -511,7 +511,7 @@ Status BlockBasedTableBuilder::Finish() {

   // Write footer
   if (ok()) {
-    Footer footer(kBlockedBasedTableMagicNumber);
+    Footer footer(kBlockBasedTableMagicNumber);
     footer.set_metaindex_handle(metaindex_block_handle);
     footer.set_index_handle(index_block_handle);
     std::string footer_encoding;
diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc
index d5b52fba2..11b8f6ca8 100644
--- a/table/block_based_table_reader.cc
+++ b/table/block_based_table_reader.cc
@@ -29,7 +29,7 @@

 namespace rocksdb {

-extern uint64_t kBlockedBasedTableMagicNumber;
+extern uint64_t kBlockBasedTableMagicNumber;

 // The longest the prefix of the cache key used to identify blocks can be.
// We are using the fact that we know for Posix files the unique ID is three @@ -228,24 +228,9 @@ Status BlockBasedTable::Open(const Options& options, uint64_t size, unique_ptr* table_reader) { table_reader->reset(); - if (size < Footer::kEncodedLength) { - return Status::InvalidArgument("file is too short to be an sstable"); - } - - char footer_space[Footer::kEncodedLength]; - Slice footer_input; - Status s = file->Read(size - Footer::kEncodedLength, Footer::kEncodedLength, - &footer_input, footer_space); - if (!s.ok()) return s; - - // Check that we actually read the whole footer from the file. It may be - // that size isn't correct. - if (footer_input.size() != Footer::kEncodedLength) { - return Status::InvalidArgument("file is too short to be an sstable"); - } - Footer footer(kBlockedBasedTableMagicNumber); - s = footer.DecodeFrom(&footer_input); + Footer footer(kBlockBasedTableMagicNumber); + auto s = ReadFooterFromFile(file.get(), size, &footer); if (!s.ok()) return s; // We've successfully read the footer and the index block: we're diff --git a/table/format.cc b/table/format.cc index 17add6680..77a55237e 100644 --- a/table/format.cc +++ b/table/format.cc @@ -73,6 +73,30 @@ Status Footer::DecodeFrom(Slice* input) { return result; } +Status ReadFooterFromFile(RandomAccessFile* file, + uint64_t file_size, + Footer* footer) { + if (file_size < Footer::kEncodedLength) { + return Status::InvalidArgument("file is too short to be an sstable"); + } + + char footer_space[Footer::kEncodedLength]; + Slice footer_input; + Status s = file->Read(file_size - Footer::kEncodedLength, + Footer::kEncodedLength, + &footer_input, + footer_space); + if (!s.ok()) return s; + + // Check that we actually read the whole footer from the file. It may be + // that size isn't correct. 
+ if (footer_input.size() != Footer::kEncodedLength) { + return Status::InvalidArgument("file is too short to be an sstable"); + } + + return footer->DecodeFrom(&footer_input); +} + Status ReadBlockContents(RandomAccessFile* file, const ReadOptions& options, const BlockHandle& handle, diff --git a/table/format.h b/table/format.h index c10cab857..207527fcb 100644 --- a/table/format.h +++ b/table/format.h @@ -98,6 +98,11 @@ class Footer { const uint64_t kTableMagicNumber; }; +// Read the footer from file +Status ReadFooterFromFile(RandomAccessFile* file, + uint64_t file_size, + Footer* footer); + // 1-byte type + 32-bit crc static const size_t kBlockTrailerSize = 5; From e1d92dfd2eb4bf00cea08ebe0f539eb11ff394fd Mon Sep 17 00:00:00 2001 From: kailiu Date: Wed, 4 Dec 2013 23:00:33 -0800 Subject: [PATCH 17/70] Fix a bunch of mac compilation issues in performance branch --- table/plain_table_builder.cc | 3 +-- table/plain_table_builder.h | 1 - table/plain_table_reader.cc | 3 ++- util/env_posix.cc | 4 +++- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc index 5a6e41df6..30d7e7d6e 100644 --- a/table/plain_table_builder.cc +++ b/table/plain_table_builder.cc @@ -23,8 +23,7 @@ namespace rocksdb { PlainTableBuilder::PlainTableBuilder(const Options& options, WritableFile* file, int user_key_size, int key_prefix_len) : - options_(options), file_(file), user_key_size_(user_key_size), - key_prefix_len_(key_prefix_len) { + options_(options), file_(file), user_key_size_(user_key_size) { std::string version; PutFixed32(&version, 1 | 0x80000000); file_->Append(Slice(version)); diff --git a/table/plain_table_builder.h b/table/plain_table_builder.h index b48552efc..a994b337c 100644 --- a/table/plain_table_builder.h +++ b/table/plain_table_builder.h @@ -75,7 +75,6 @@ private: uint64_t num_entries_ = 0; const size_t user_key_size_; - const size_t key_prefix_len_; bool closed_ = false; // Either Finish() or Abandon() has been called. int GetInternalKeyLength() { diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index e7f48df33..cccaf61c9 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -40,7 +40,8 @@ public: class slice_comparator { public: - bool operator()(rocksdb::Slice const& s1, rocksdb::Slice const& s2) { + bool operator()(rocksdb::Slice const& s1, + rocksdb::Slice const& s2) const { return s1.compare(s2) < 0; } }; diff --git a/util/env_posix.cc b/util/env_posix.cc index 28901be3f..1ed8d6960 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -1404,11 +1404,13 @@ class PosixEnv : public Env { (unsigned long)t); // Set the thread name to aid debugging -#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) && (__GLIBC_PREREQ(2, 12)) +#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 12) char name_buf[16]; snprintf(name_buf, sizeof name_buf, "rocksdb:bg%zu", bgthreads_.size()); name_buf[sizeof name_buf - 1] = '\0'; pthread_setname_np(t, name_buf); +#endif #endif bgthreads_.push_back(t); From 90729f8b23918d8448d15abd8f6ef216a7aa641a Mon Sep 17 00:00:00 2001 From: kailiu Date: Thu, 5 Dec 2013 13:09:13 -0800 Subject: [PATCH 18/70] Extract metaindex block from block-based table Summary: This change will allow other table to reuse the code for meta blocks. 
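For illustration, the builders extracted here are used together along these
lines inside BlockBasedTableBuilder::Finish() (a minimal sketch based on the
interfaces introduced below; `props` and the handle variables stand in for
the builder's real state):

    // Gather table properties into a sorted property block and write it out.
    PropertyBlockBuilder property_block_builder;
    property_block_builder.AddTableProperty(props);
    BlockHandle properties_block_handle;
    WriteRawBlock(property_block_builder.Finish(), kNoCompression,
                  &properties_block_handle);

    // Register each meta block in the metaindex, then write the metaindex
    // block itself; the footer will later point to it.
    MetaIndexBuilder meta_index_builder;
    meta_index_builder.Add(BlockBasedTable::kPropertiesBlock,
                           properties_block_handle);
    BlockHandle metaindex_block_handle;
    WriteRawBlock(meta_index_builder.Finish(), kNoCompression,
                  &metaindex_block_handle);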
Test Plan: all existing unit tests passed Reviewers: dhruba, haobo, sdong CC: leveldb Differential Revision: https://reviews.facebook.net/D14475 --- db/table_properties_collector.cc | 91 +------------ db/table_properties_collector.h | 12 +- db/table_properties_collector_test.cc | 7 +- include/rocksdb/options.h | 5 +- include/rocksdb/table_properties.h | 38 ++++-- table/block_based_table_builder.cc | 176 ++++++-------------------- table/block_based_table_reader.cc | 37 ++---- table/block_based_table_reader.h | 11 -- table/meta_blocks.cc | 134 ++++++++++++++++++++ table/meta_blocks.h | 106 ++++++++++++++++ table/table_properties.cc | 108 ++++++++++++++++ 11 files changed, 432 insertions(+), 293 deletions(-) create mode 100644 table/meta_blocks.cc create mode 100644 table/meta_blocks.h create mode 100644 table/table_properties.cc diff --git a/db/table_properties_collector.cc b/db/table_properties_collector.cc index 3654663c1..25bd70036 100644 --- a/db/table_properties_collector.cc +++ b/db/table_properties_collector.cc @@ -10,87 +10,6 @@ namespace rocksdb { -namespace { - void AppendProperty( - std::string& props, - const std::string& key, - const std::string& value, - const std::string& prop_delim, - const std::string& kv_delim) { - props.append(key); - props.append(kv_delim); - props.append(value); - props.append(prop_delim); - } - - template - void AppendProperty( - std::string& props, - const std::string& key, - const TValue& value, - const std::string& prop_delim, - const std::string& kv_delim) { - AppendProperty( - props, key, std::to_string(value), prop_delim, kv_delim - ); - } -} - -std::string TableProperties::ToString( - const std::string& prop_delim, - const std::string& kv_delim) const { - std::string result; - result.reserve(1024); - - // Basic Info - AppendProperty( - result, "# data blocks", num_data_blocks, prop_delim, kv_delim - ); - AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim); - - AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim); - AppendProperty( - result, - "raw average key size", - num_entries != 0 ? 1.0 * raw_key_size / num_entries : 0.0, - prop_delim, - kv_delim - ); - AppendProperty( - result, "raw value size", raw_value_size, prop_delim, kv_delim - ); - AppendProperty( - result, - "raw average value size", - num_entries != 0 ? 1.0 * raw_value_size / num_entries : 0.0, - prop_delim, - kv_delim - ); - - AppendProperty(result, "data block size", data_size, prop_delim, kv_delim); - AppendProperty(result, "index block size", index_size, prop_delim, kv_delim); - AppendProperty( - result, "filter block size", filter_size, prop_delim, kv_delim - ); - AppendProperty( - result, - "(estimated) table size", - data_size + index_size + filter_size, - prop_delim, - kv_delim - ); - - AppendProperty( - result, - "filter policy name", - filter_policy_name.empty() ? 
std::string("N/A") : filter_policy_name, - prop_delim, - kv_delim - ); - - return result; -} - Status InternalKeyPropertiesCollector::Add( const Slice& key, const Slice& value) { ParsedInternalKey ikey; @@ -106,7 +25,7 @@ Status InternalKeyPropertiesCollector::Add( } Status InternalKeyPropertiesCollector::Finish( - TableProperties::UserCollectedProperties* properties) { + UserCollectedProperties* properties) { assert(properties); assert(properties->find( InternalKeyTablePropertiesNames::kDeletedKeys) == properties->end()); @@ -118,7 +37,7 @@ Status InternalKeyPropertiesCollector::Finish( return Status::OK(); } -TableProperties::UserCollectedProperties +UserCollectedProperties InternalKeyPropertiesCollector::GetReadableProperties() const { return { { "kDeletedKeys", std::to_string(deleted_keys_) } @@ -137,11 +56,11 @@ Status UserKeyTablePropertiesCollector::Add( } Status UserKeyTablePropertiesCollector::Finish( - TableProperties::UserCollectedProperties* properties) { + UserCollectedProperties* properties) { return collector_->Finish(properties); } -TableProperties::UserCollectedProperties +UserCollectedProperties UserKeyTablePropertiesCollector::GetReadableProperties() const { return collector_->GetReadableProperties(); } @@ -151,7 +70,7 @@ const std::string InternalKeyTablePropertiesNames::kDeletedKeys = "rocksdb.deleted.keys"; uint64_t GetDeletedKeys( - const TableProperties::UserCollectedProperties& props) { + const UserCollectedProperties& props) { auto pos = props.find(InternalKeyTablePropertiesNames::kDeletedKeys); if (pos == props.end()) { return 0; diff --git a/db/table_properties_collector.h b/db/table_properties_collector.h index 533130db7..6cf56291a 100644 --- a/db/table_properties_collector.h +++ b/db/table_properties_collector.h @@ -24,15 +24,13 @@ class InternalKeyPropertiesCollector : public TablePropertiesCollector { public: virtual Status Add(const Slice& key, const Slice& value) override; - virtual Status Finish( - TableProperties::UserCollectedProperties* properties) override; + virtual Status Finish(UserCollectedProperties* properties) override; virtual const char* Name() const override { return "InternalKeyPropertiesCollector"; } - TableProperties::UserCollectedProperties - GetReadableProperties() const override; + UserCollectedProperties GetReadableProperties() const override; private: uint64_t deleted_keys_ = 0; @@ -61,13 +59,11 @@ class UserKeyTablePropertiesCollector : public TablePropertiesCollector { virtual Status Add(const Slice& key, const Slice& value) override; - virtual Status Finish( - TableProperties::UserCollectedProperties* properties) override; + virtual Status Finish(UserCollectedProperties* properties) override; virtual const char* Name() const override { return collector_->Name(); } - TableProperties::UserCollectedProperties - GetReadableProperties() const override; + UserCollectedProperties GetReadableProperties() const override; protected: std::shared_ptr collector_; diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 6f405b28a..bbac4aa64 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -114,10 +114,10 @@ class RegularKeysStartWithA: public TablePropertiesCollector { public: const char* Name() const { return "RegularKeysStartWithA"; } - Status Finish(TableProperties::UserCollectedProperties* properties) { + Status Finish(UserCollectedProperties* properties) { std::string encoded; PutVarint32(&encoded, count_); - *properties = 
TableProperties::UserCollectedProperties {
+    *properties = UserCollectedProperties {
       { "TablePropertiesTest", "Rocksdb" },
       { "Count", encoded }
     };
@@ -132,8 +132,7 @@ class RegularKeysStartWithA: public TablePropertiesCollector {
     return Status::OK();
   }

-  virtual TableProperties::UserCollectedProperties
-  GetReadableProperties() const {
+  virtual UserCollectedProperties GetReadableProperties() const {
     return {};
   }

diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 0cc33be68..d5f671ebe 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -611,8 +611,9 @@ struct Options {
   // the tables.
   // Default: empty vector -- no user-defined statistics collection will be
   // performed.
-  std::vector<std::shared_ptr<TablePropertiesCollector>>
-      table_properties_collectors;
+  typedef std::vector<std::shared_ptr<TablePropertiesCollector>>
+      TablePropertiesCollectors;
+  TablePropertiesCollectors table_properties_collectors;

   // Allows thread-safe inplace updates. Requires Updates iff
   // * key exists in current memtable
diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h
index 8824ca13c..75c8bcc16 100644
--- a/include/rocksdb/table_properties.h
+++ b/include/rocksdb/table_properties.h
@@ -11,18 +11,18 @@

 namespace rocksdb {

+// Other than basic table properties, each table may also have the user
+// collected properties.
+// The values of the user-collected properties are encoded as raw bytes --
+// users have to interpret these values by themselves.
+typedef
+    std::unordered_map<std::string, std::string>
+    UserCollectedProperties;
+
 // TableProperties contains a bunch of read-only properties of its associated
 // table.
 struct TableProperties {
 public:
-  // Other than basic table properties, each table may also have the user
-  // collected properties.
-  // The value of the user-collected properties are encoded as raw bytes --
-  // users have to interprete these values by themselves.
-  typedef
-      std::unordered_map<std::string, std::string>
-      UserCollectedProperties;
-
   // the total size of all data blocks.
   uint64_t data_size = 0;
   // the size of index block.
@@ -52,6 +52,19 @@ struct TableProperties {
       const std::string& kv_delim = "=") const;
 };

+// Table properties' human-readable names in the property block.
+struct TablePropertiesNames {
+  static const std::string kDataSize;
+  static const std::string kIndexSize;
+  static const std::string kFilterSize;
+  static const std::string kRawKeySize;
+  static const std::string kRawValueSize;
+  static const std::string kNumDataBlocks;
+  static const std::string kNumEntries;
+  static const std::string kFilterPolicy;
+};
+
+
 // `TablePropertiesCollector` provides the mechanism for users to collect
 // their own interested properties. This class is essentially a collection
 // of callback functions that will be invoked during table building.
@@ -68,23 +81,20 @@ class TablePropertiesCollector {
   // for writing the properties block.
   // @params properties  User will add their collected statistics to
   // `properties`.
-  virtual Status Finish(
-      TableProperties::UserCollectedProperties* properties) = 0;
+  virtual Status Finish(UserCollectedProperties* properties) = 0;

   // The name of the properties collector can be used for debugging purposes.
   virtual const char* Name() const = 0;

   // Return the human-readable properties, where the key is property name and
   // the value is the human-readable form of value.
- virtual TableProperties::UserCollectedProperties - GetReadableProperties() const = 0; + virtual UserCollectedProperties GetReadableProperties() const = 0; }; // Extra properties // Below is a list of non-basic properties that are collected by database // itself. Especially some properties regarding to the internal keys (which // is unknown to `table`). -extern uint64_t GetDeletedKeys( - const TableProperties::UserCollectedProperties& props); +extern uint64_t GetDeletedKeys(const UserCollectedProperties& props); } // namespace rocksdb diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 4cba3934f..a6dbe3519 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -26,6 +26,7 @@ #include "table/block_builder.h" #include "table/filter_block.h" #include "table/format.h" +#include "table/meta_blocks.h" #include "util/coding.h" #include "util/crc32c.h" #include "util/stop_watch.h" @@ -34,47 +35,11 @@ namespace rocksdb { namespace { -struct BytewiseLessThan { - bool operator()(const std::string& key1, const std::string& key2) const { - // smaller entries will be placed in front. - return comparator->Compare(key1, key2) <= 0; - } - const Comparator* comparator = BytewiseComparator(); -}; - -// When writing to a block that requires entries to be sorted by -// `BytewiseComparator`, we can buffer the content to `BytewiseSortedMap` -// before writng to store. -typedef std::map BytewiseSortedMap; - -void AddProperties(BytewiseSortedMap& props, std::string name, uint64_t val) { - assert(props.find(name) == props.end()); - - std::string dst; - PutVarint64(&dst, val); - - props.insert( - std::make_pair(name, dst) - ); -} - static bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) { // Check to see if compressed less than 12.5% return compressed_size < raw_size - (raw_size / 8u); } -// Were we encounter any error occurs during user-defined statistics collection, -// we'll write the warning message to info log. -void LogPropertiesCollectionError( - Logger* info_log, const std::string& method, const std::string& name) { - assert(method == "Add" || method == "Finish"); - - std::string msg = - "[Warning] encountered error when calling TablePropertiesCollector::" + - method + "() with collector name: " + name; - Log(info_log, "%s", msg.c_str()); -} - } // anonymous namespace // kBlockBasedTableMagicNumber was picked by running @@ -186,16 +151,12 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { r->props.raw_key_size += key.size(); r->props.raw_value_size += value.size(); - for (auto collector : r->options.table_properties_collectors) { - Status s = collector->Add(key, value); - if (!s.ok()) { - LogPropertiesCollectionError( - r->options.info_log.get(), - "Add", /* method */ - collector->Name() - ); - } - } + NotifyCollectTableCollectorsOnAdd( + key, + value, + r->options.table_properties_collectors, + r->options.info_log.get() + ); } void BlockBasedTableBuilder::Flush() { @@ -389,14 +350,7 @@ Status BlockBasedTableBuilder::Finish() { // 2. [meta block: properties] // 3. [metaindex block] if (ok()) { - // We use `BytewiseComparator` as the comparator for meta block. - BlockBuilder meta_index_block( - r->options.block_restart_interval, - BytewiseComparator() - ); - // Key: meta block name - // Value: block handle to that meta block - BytewiseSortedMap meta_block_handles; + MetaIndexBuilder meta_index_builer; // Write filter block. 
if (r->filter_block != nullptr) { @@ -404,104 +358,43 @@ Status BlockBasedTableBuilder::Finish() { // of filter data. std::string key = BlockBasedTable::kFilterBlockPrefix; key.append(r->options.filter_policy->Name()); - std::string handle_encoding; - filter_block_handle.EncodeTo(&handle_encoding); - meta_block_handles.insert( - std::make_pair(key, handle_encoding) - ); + meta_index_builer.Add(key, filter_block_handle); } // Write properties block. { - BlockBuilder properties_block( - r->options.block_restart_interval, - BytewiseComparator() - ); - - BytewiseSortedMap properties; - - // Add basic properties - AddProperties( - properties, - BlockBasedTablePropertiesNames::kRawKeySize, - r->props.raw_key_size - ); - AddProperties( - properties, - BlockBasedTablePropertiesNames::kRawValueSize, - r->props.raw_value_size - ); - AddProperties( - properties, - BlockBasedTablePropertiesNames::kDataSize, - r->props.data_size - ); + PropertyBlockBuilder property_block_builder; + std::vector failed_user_prop_collectors; + r->props.filter_policy_name = r->options.filter_policy != nullptr ? + r->options.filter_policy->Name() : ""; r->props.index_size = r->index_block.CurrentSizeEstimate() + kBlockTrailerSize; - AddProperties( - properties, - BlockBasedTablePropertiesNames::kIndexSize, - r->props.index_size - ); - AddProperties( - properties, - BlockBasedTablePropertiesNames::kNumEntries, - r->props.num_entries - ); - AddProperties( - properties, - BlockBasedTablePropertiesNames::kNumDataBlocks, - r->props.num_data_blocks); - if (r->filter_block != nullptr) { - properties.insert({ - BlockBasedTablePropertiesNames::kFilterPolicy, - r->options.filter_policy->Name() - }); - } - AddProperties( - properties, - BlockBasedTablePropertiesNames::kFilterSize, - r->props.filter_size - ); - for (auto collector : r->options.table_properties_collectors) { - TableProperties::UserCollectedProperties user_collected_properties; - Status s = - collector->Finish(&user_collected_properties); - - if (!s.ok()) { - LogPropertiesCollectionError( - r->options.info_log.get(), - "Finish", /* method */ - collector->Name() - ); - } else { - properties.insert( - user_collected_properties.begin(), - user_collected_properties.end() - ); - } - } + // Add basic properties + property_block_builder.AddTableProperty(r->props); - for (const auto& stat : properties) { - properties_block.Add(stat.first, stat.second); - } + NotifyCollectTableCollectorsOnFinish( + r->options.table_properties_collectors, + r->options.info_log.get(), + &property_block_builder + ); BlockHandle properties_block_handle; - WriteBlock(&properties_block, &properties_block_handle); - - std::string handle_encoding; - properties_block_handle.EncodeTo(&handle_encoding); - meta_block_handles.insert( - { BlockBasedTable::kPropertiesBlock, handle_encoding } + WriteRawBlock( + property_block_builder.Finish(), + kNoCompression, + &properties_block_handle ); - } // end of properties block writing - for (const auto& metablock : meta_block_handles) { - meta_index_block.Add(metablock.first, metablock.second); - } + meta_index_builer.Add(BlockBasedTable::kPropertiesBlock, + properties_block_handle); + } // end of properties block writing - WriteBlock(&meta_index_block, &metaindex_block_handle); + WriteRawBlock( + meta_index_builer.Finish(), + kNoCompression, + &metaindex_block_handle + ); } // meta blocks and metaindex block. 
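  // At this point the file contains, in order: [data blocks] [filter block]
  // [properties block] [metaindex block]; metaindex_block_handle records
  // where the metaindex block was written so the footer can point to it.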
// Write index block @@ -563,4 +456,9 @@ uint64_t BlockBasedTableBuilder::FileSize() const { return rep_->offset; } +const std::string BlockBasedTable::kFilterBlockPrefix = + "filter."; +const std::string BlockBasedTable::kPropertiesBlock = + "rocksdb.properties"; + } // namespace rocksdb diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 11b8f6ca8..e69bef679 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -432,19 +432,19 @@ Status BlockBasedTable::ReadProperties( // All pre-defined properties of type uint64_t std::unordered_map predefined_uint64_properties = { - { BlockBasedTablePropertiesNames::kDataSize, + { TablePropertiesNames::kDataSize, &table_properties->data_size }, - { BlockBasedTablePropertiesNames::kIndexSize, + { TablePropertiesNames::kIndexSize, &table_properties->index_size }, - { BlockBasedTablePropertiesNames::kFilterSize, + { TablePropertiesNames::kFilterSize, &table_properties->filter_size }, - { BlockBasedTablePropertiesNames::kRawKeySize, + { TablePropertiesNames::kRawKeySize, &table_properties->raw_key_size }, - { BlockBasedTablePropertiesNames::kRawValueSize, + { TablePropertiesNames::kRawValueSize, &table_properties->raw_value_size }, - { BlockBasedTablePropertiesNames::kNumDataBlocks, + { TablePropertiesNames::kNumDataBlocks, &table_properties->num_data_blocks }, - { BlockBasedTablePropertiesNames::kNumEntries, + { TablePropertiesNames::kNumEntries, &table_properties->num_entries }, }; @@ -478,7 +478,7 @@ Status BlockBasedTable::ReadProperties( continue; } *(pos->second) = val; - } else if (key == BlockBasedTablePropertiesNames::kFilterPolicy) { + } else if (key == TablePropertiesNames::kFilterPolicy) { table_properties->filter_policy_name = raw_val.ToString(); } else { // handle user-collected @@ -1062,25 +1062,4 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) { return result; } -const std::string BlockBasedTable::kFilterBlockPrefix = - "filter."; -const std::string BlockBasedTable::kPropertiesBlock = - "rocksdb.properties"; -const std::string BlockBasedTablePropertiesNames::kDataSize = - "rocksdb.data.size"; -const std::string BlockBasedTablePropertiesNames::kIndexSize = - "rocksdb.index.size"; -const std::string BlockBasedTablePropertiesNames::kFilterSize = - "rocksdb.filter.size"; -const std::string BlockBasedTablePropertiesNames::kRawKeySize = - "rocksdb.raw.key.size"; -const std::string BlockBasedTablePropertiesNames::kRawValueSize = - "rocksdb.raw.value.size"; -const std::string BlockBasedTablePropertiesNames::kNumDataBlocks = - "rocksdb.num.data.blocks"; -const std::string BlockBasedTablePropertiesNames::kNumEntries = - "rocksdb.num.entries"; -const std::string BlockBasedTablePropertiesNames::kFilterPolicy = - "rocksdb.filter.policy"; - } // namespace rocksdb diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 02bbfd74c..72fb35fa5 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -181,15 +181,4 @@ class BlockBasedTable : public TableReader { void operator=(const TableReader&) = delete; }; -struct BlockBasedTablePropertiesNames { - static const std::string kDataSize; - static const std::string kIndexSize; - static const std::string kFilterSize; - static const std::string kRawKeySize; - static const std::string kRawValueSize; - static const std::string kNumDataBlocks; - static const std::string kNumEntries; - static const std::string kFilterPolicy; -}; - } // namespace rocksdb diff --git 
a/table/meta_blocks.cc b/table/meta_blocks.cc new file mode 100644 index 000000000..df3ee5dae --- /dev/null +++ b/table/meta_blocks.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "table/meta_blocks.h" + +#include + +#include "rocksdb/table_properties.h" +#include "table/format.h" +#include "util/coding.h" + +namespace rocksdb { + +MetaIndexBuilder::MetaIndexBuilder() + : meta_index_block_( + new BlockBuilder(1 /* restart interval */, BytewiseComparator())) { +} + +void MetaIndexBuilder::Add(const std::string& key, + const BlockHandle& handle) { + std::string handle_encoding; + handle.EncodeTo(&handle_encoding); + meta_block_handles_.insert({key, handle_encoding}); +} + +Slice MetaIndexBuilder::Finish() { + for (const auto& metablock : meta_block_handles_) { + meta_index_block_->Add(metablock.first, metablock.second); + } + return meta_index_block_->Finish(); +} + +PropertyBlockBuilder::PropertyBlockBuilder() + : properties_block_( + new BlockBuilder(1 /* restart interval */, BytewiseComparator())) { +} + +void PropertyBlockBuilder::Add(const std::string& name, + const std::string& val) { + props_.insert({name, val}); +} + +void PropertyBlockBuilder::Add(const std::string& name, uint64_t val) { + assert(props_.find(name) == props_.end()); + + std::string dst; + PutVarint64(&dst, val); + + Add(name, dst); +} + +void PropertyBlockBuilder::Add( + const UserCollectedProperties& user_collected_properties) { + for (const auto& prop : user_collected_properties) { + Add(prop.first, prop.second); + } +} + +void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { + Add(TablePropertiesNames::kRawKeySize, props.raw_key_size); + Add(TablePropertiesNames::kRawValueSize, props.raw_value_size); + Add(TablePropertiesNames::kDataSize, props.data_size); + Add(TablePropertiesNames::kIndexSize, props.index_size); + Add(TablePropertiesNames::kNumEntries, props.num_entries); + Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks); + Add(TablePropertiesNames::kFilterSize, props.filter_size); + + if (!props.filter_policy_name.empty()) { + Add(TablePropertiesNames::kFilterPolicy, + props.filter_policy_name); + } +} + +Slice PropertyBlockBuilder::Finish() { + for (const auto& prop : props_) { + properties_block_->Add(prop.first, prop.second); + } + + return properties_block_->Finish(); +} + +void LogPropertiesCollectionError( + Logger* info_log, const std::string& method, const std::string& name) { + assert(method == "Add" || method == "Finish"); + + std::string msg = + "[Warning] encountered error when calling TablePropertiesCollector::" + + method + "() with collector name: " + name; + Log(info_log, "%s", msg.c_str()); +} + +bool NotifyCollectTableCollectorsOnAdd( + const Slice& key, + const Slice& value, + const Options::TablePropertiesCollectors& collectors, + Logger* info_log) { + bool all_succeeded = true; + for (auto collector : collectors) { + Status s = collector->Add(key, value); + all_succeeded = all_succeeded && s.ok(); + if (!s.ok()) { + LogPropertiesCollectionError( + info_log, "Add", /* method */ collector->Name() + ); + } + } + return all_succeeded; +} + +bool NotifyCollectTableCollectorsOnFinish( + const Options::TablePropertiesCollectors& collectors, + Logger* info_log, + PropertyBlockBuilder* 
builder) {
+  bool all_succeeded = true;
+  for (auto collector : collectors) {
+    UserCollectedProperties user_collected_properties;
+    Status s = collector->Finish(&user_collected_properties);
+
+    all_succeeded = all_succeeded && s.ok();
+    if (!s.ok()) {
+      LogPropertiesCollectionError(
+          info_log, "Finish", /* method */ collector->Name()
+      );
+    } else {
+      builder->Add(user_collected_properties);
+    }
+  }
+
+  return all_succeeded;
+}
+
+}  // namespace rocksdb
diff --git a/table/meta_blocks.h b/table/meta_blocks.h
new file mode 100644
index 000000000..d0718ec07
--- /dev/null
+++ b/table/meta_blocks.h
@@ -0,0 +1,106 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "table/block_builder.h"
+
+namespace rocksdb {
+
+class BlockHandle;
+class BlockBuilder;
+class Logger;
+struct TableProperties;
+
+// An STL style comparator that does the bytewise comparator comparison
+// internally.
+struct BytewiseLessThan {
+  bool operator()(const std::string& key1, const std::string& key2) const {
+    // smaller entries will be placed in front.
+    return comparator->Compare(key1, key2) <= 0;
+  }
+
+  const Comparator* comparator = BytewiseComparator();
+};
+
+// When writing to a block that requires entries to be sorted by
+// `BytewiseComparator`, we can buffer the content in a `BytewiseSortedMap`
+// before writing it to the store.
+typedef std::map<std::string, std::string, BytewiseLessThan> BytewiseSortedMap;
+
+class MetaIndexBuilder {
+ public:
+  MetaIndexBuilder(const MetaIndexBuilder&) = delete;
+  MetaIndexBuilder& operator=(const MetaIndexBuilder&) = delete;
+
+  MetaIndexBuilder();
+  void Add(const std::string& key, const BlockHandle& handle);
+
+  // Write all the added key/value pairs to the block and return the contents
+  // of the block.
+  Slice Finish();
+
+ private:
+  // * Key: meta block name
+  // * Value: block handle to that meta block
+  struct Rep;
+  Rep* rep_;
+
+  // store the sorted key/handle of the metablocks.
+  BytewiseSortedMap meta_block_handles_;
+  std::unique_ptr<BlockBuilder> meta_index_block_;
+};
+
+class PropertyBlockBuilder {
+ public:
+  PropertyBlockBuilder(const PropertyBlockBuilder&) = delete;
+  PropertyBlockBuilder& operator=(const PropertyBlockBuilder&) = delete;
+
+  PropertyBlockBuilder();
+
+  void AddTableProperty(const TableProperties& props);
+  void Add(const std::string& key, uint64_t value);
+  void Add(const std::string& key, const std::string& value);
+  void Add(const UserCollectedProperties& user_collected_properties);
+
+  // Write all the added entries to the block and return the block contents
+  Slice Finish();
+
+ private:
+  std::unique_ptr<BlockBuilder> properties_block_;
+  BytewiseSortedMap props_;
+};
+
+// When we encounter an error during user-defined statistics collection,
+// we'll write the warning message to the info log.
+void LogPropertiesCollectionError(
+    Logger* info_log, const std::string& method, const std::string& name);
+
+// Utility functions that help the table builder trigger batch events for
+// user-defined property collectors.
+// The return value indicates whether any error occurred; if it did,
+// a warning message will have been logged.
+// NotifyCollectTableCollectorsOnAdd() triggers the `Add` event for all
+// property collectors.
+bool NotifyCollectTableCollectorsOnAdd(
+    const Slice& key,
+    const Slice& value,
+    const Options::TablePropertiesCollectors& collectors,
+    Logger* info_log);
+
+// NotifyCollectTableCollectorsOnFinish() triggers the `Finish` event for all
+// property collectors. The collected properties will be added to `builder`.
+bool NotifyCollectTableCollectorsOnFinish(
+    const Options::TablePropertiesCollectors& collectors,
+    Logger* info_log,
+    PropertyBlockBuilder* builder);
+
+}  // namespace rocksdb
diff --git a/table/table_properties.cc b/table/table_properties.cc
new file mode 100644
index 000000000..2c9905884
--- /dev/null
+++ b/table/table_properties.cc
@@ -0,0 +1,108 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include "rocksdb/table_properties.h"
+
+namespace rocksdb {
+
+namespace {
+  void AppendProperty(
+      std::string& props,
+      const std::string& key,
+      const std::string& value,
+      const std::string& prop_delim,
+      const std::string& kv_delim) {
+    props.append(key);
+    props.append(kv_delim);
+    props.append(value);
+    props.append(prop_delim);
+  }
+
+  template <class TValue>
+  void AppendProperty(
+      std::string& props,
+      const std::string& key,
+      const TValue& value,
+      const std::string& prop_delim,
+      const std::string& kv_delim) {
+    AppendProperty(
+        props, key, std::to_string(value), prop_delim, kv_delim
+    );
+  }
+}
+
+std::string TableProperties::ToString(
+    const std::string& prop_delim,
+    const std::string& kv_delim) const {
+  std::string result;
+  result.reserve(1024);
+
+  // Basic Info
+  AppendProperty(
+      result, "# data blocks", num_data_blocks, prop_delim, kv_delim
+  );
+  AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim);
+
+  AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim);
+  AppendProperty(
+      result,
+      "raw average key size",
+      num_entries != 0 ? 1.0 * raw_key_size / num_entries : 0.0,
+      prop_delim,
+      kv_delim
+  );
+  AppendProperty(
+      result, "raw value size", raw_value_size, prop_delim, kv_delim
+  );
+  AppendProperty(
+      result,
+      "raw average value size",
+      num_entries != 0 ? 1.0 * raw_value_size / num_entries : 0.0,
+      prop_delim,
+      kv_delim
+  );
+
+  AppendProperty(result, "data block size", data_size, prop_delim, kv_delim);
+  AppendProperty(result, "index block size", index_size, prop_delim, kv_delim);
+  AppendProperty(
+      result, "filter block size", filter_size, prop_delim, kv_delim
+  );
+  AppendProperty(
+      result,
+      "(estimated) table size",
+      data_size + index_size + filter_size,
+      prop_delim,
+      kv_delim
+  );
+
+  AppendProperty(
+      result,
+      "filter policy name",
+      filter_policy_name.empty() ?
std::string("N/A") : filter_policy_name, + prop_delim, + kv_delim + ); + + return result; +} + +const std::string TablePropertiesNames::kDataSize = + "rocksdb.data.size"; +const std::string TablePropertiesNames::kIndexSize = + "rocksdb.index.size"; +const std::string TablePropertiesNames::kFilterSize = + "rocksdb.filter.size"; +const std::string TablePropertiesNames::kRawKeySize = + "rocksdb.raw.key.size"; +const std::string TablePropertiesNames::kRawValueSize = + "rocksdb.raw.value.size"; +const std::string TablePropertiesNames::kNumDataBlocks = + "rocksdb.num.data.blocks"; +const std::string TablePropertiesNames::kNumEntries = + "rocksdb.num.entries"; +const std::string TablePropertiesNames::kFilterPolicy = + "rocksdb.filter.policy"; + +} // namespace rocksdb From ef2211a9ca449ea2c3416ba4e1b8012f100d2d46 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Mon, 2 Dec 2013 18:34:05 -0800 Subject: [PATCH 19/70] [RocksDB Performance Branch] Introduce MergeContext to Lazily Initialize merge operand list Summary: In get operations, merge_operands is only used in few cases. Lazily initialize it can reduce average latency in some cases Test Plan: make all check Reviewers: haobo, kailiu, dhruba Reviewed By: haobo CC: igor, nkg-, leveldb Differential Revision: https://reviews.facebook.net/D14415 --- db/db_impl.cc | 21 +++++----- db/db_impl_readonly.cc | 8 ++-- db/memtable.cc | 32 +++++++-------- db/memtable.h | 3 +- db/memtablelist.cc | 5 +-- db/memtablelist.h | 2 +- db/merge_context.h | 69 ++++++++++++++++++++++++++++++++ db/version_set.cc | 34 +++++++++------- db/version_set.h | 7 ++-- include/rocksdb/merge_operator.h | 1 + 10 files changed, 129 insertions(+), 53 deletions(-) create mode 100644 db/merge_context.h diff --git a/db/db_impl.cc b/db/db_impl.cc index 436b0c9d7..6875cb37c 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -27,6 +27,7 @@ #include "db/log_writer.h" #include "db/memtable.h" #include "db/memtablelist.h" +#include "db/merge_context.h" #include "db/merge_helper.h" #include "db/prefix_filter_iterator.h" #include "db/table_cache.h" @@ -2602,22 +2603,22 @@ Status DBImpl::GetImpl(const ReadOptions& options, // Prepare to store a list of merge operations if merge occurs. - std::deque merge_operands; + MergeContext merge_context; // First look in the memtable, then in the immutable memtable (if any). // s is both in/out. When in, s could either be OK or MergeInProgress. // merge_operands will contain the sequence of merges in the latter case. LookupKey lkey(key, snapshot); BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer); - if (mem->Get(lkey, value, &s, &merge_operands, options_)) { + if (mem->Get(lkey, value, &s, merge_context, options_)) { // Done - } else if (imm.Get(lkey, value, &s, &merge_operands, options_)) { + } else if (imm.Get(lkey, value, &s, merge_context, options_)) { // Done } else { StopWatchNano from_files_timer(env_, false); StartPerfTimer(&from_files_timer); - current->Get(options, lkey, value, &s, &merge_operands, &stats, + current->Get(options, lkey, value, &s, &merge_context, &stats, options_, value_found); have_stat_update = true; BumpPerfTime(&perf_context.get_from_output_files_time, &from_files_timer); @@ -2680,8 +2681,8 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, bool have_stat_update = false; Version::GetStats stats; - // Prepare to store a list of merge operations if merge occurs. - std::deque merge_operands; + // Contain a list of merge operations if merge occurs. 
+ MergeContext merge_context; // Note: this always resizes the values array int numKeys = keys.size(); @@ -2697,17 +2698,17 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, // s is both in/out. When in, s could either be OK or MergeInProgress. // merge_operands will contain the sequence of merges in the latter case. for (int i=0; iGet(lkey, value, &s, &merge_operands, options_)) { + if (mem->Get(lkey, value, &s, merge_context, options_)) { // Done - } else if (imm.Get(lkey, value, &s, &merge_operands, options_)) { + } else if (imm.Get(lkey, value, &s, merge_context, options_)) { // Done } else { - current->Get(options, lkey, value, &s, &merge_operands, &stats, options_); + current->Get(options, lkey, value, &s, &merge_context, &stats, options_); have_stat_update = true; } diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index 27d5c31ed..dbb297e93 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -23,6 +23,7 @@ #include "db/log_reader.h" #include "db/log_writer.h" #include "db/memtable.h" +#include "db/merge_context.h" #include "db/table_cache.h" #include "db/version_set.h" #include "db/write_batch_internal.h" @@ -30,6 +31,7 @@ #include "rocksdb/env.h" #include "rocksdb/status.h" #include "rocksdb/table.h" +#include "rocksdb/merge_operator.h" #include "port/port.h" #include "table/block.h" #include "table/merger.h" @@ -57,12 +59,12 @@ Status DBImplReadOnly::Get(const ReadOptions& options, MemTable* mem = GetMemTable(); Version* current = versions_->current(); SequenceNumber snapshot = versions_->LastSequence(); - std::deque merge_operands; + MergeContext merge_context; LookupKey lkey(key, snapshot); - if (mem->Get(lkey, value, &s, &merge_operands, options_)) { + if (mem->Get(lkey, value, &s, merge_context, options_)) { } else { Version::GetStats stats; - current->Get(options, lkey, value, &s, &merge_operands, &stats, options_); + current->Get(options, lkey, value, &s, &merge_context, &stats, options_); } return s; } diff --git a/db/memtable.cc b/db/memtable.cc index 082d468d7..b4df915e0 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -12,6 +12,7 @@ #include #include "db/dbformat.h" +#include "db/merge_context.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" @@ -163,7 +164,7 @@ void MemTable::Add(SequenceNumber s, ValueType type, } bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, - std::deque* operands, const Options& options) { + MergeContext& merge_context, const Options& options) { StopWatchNano memtable_get_timer(options.env, false); StartPerfTimer(&memtable_get_timer); @@ -172,9 +173,6 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, table_->GetIterator(key.user_key())); iter->Seek(key.user_key(), memkey.data()); - // It is the caller's responsibility to allocate/delete operands list - assert(operands != nullptr); - bool merge_in_progress = s->IsMergeInProgress(); auto merge_operator = options.merge_operator.get(); auto logger = options.info_log; @@ -207,8 +205,9 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, *s = Status::OK(); if (merge_in_progress) { assert(merge_operator); - if (!merge_operator->FullMerge(key.user_key(), &v, *operands, - value, logger.get())) { + if (!merge_operator->FullMerge(key.user_key(), &v, + merge_context.GetOperands(), value, + logger.get())) { RecordTick(options.statistics.get(), NUMBER_MERGE_FAILURES); *s = Status::Corruption("Error: Could not perform merge."); } @@ -225,8 +224,9 @@ bool 
MemTable::Get(const LookupKey& key, std::string* value, Status* s, if (merge_in_progress) { assert(merge_operator); *s = Status::OK(); - if (!merge_operator->FullMerge(key.user_key(), nullptr, *operands, - value, logger.get())) { + if (!merge_operator->FullMerge(key.user_key(), nullptr, + merge_context.GetOperands(), value, + logger.get())) { RecordTick(options.statistics.get(), NUMBER_MERGE_FAILURES); *s = Status::Corruption("Error: Could not perform merge."); } @@ -239,16 +239,14 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, case kTypeMerge: { Slice v = GetLengthPrefixedSlice(key_ptr + key_length); merge_in_progress = true; - operands->push_front(v.ToString()); - while(operands->size() >= 2) { + merge_context.PushOperand(v); + while(merge_context.GetNumOperands() >= 2) { // Attempt to associative merge. (Returns true if successful) - if (merge_operator->PartialMerge(key.user_key(), - Slice((*operands)[0]), - Slice((*operands)[1]), - &merge_result, - logger.get())) { - operands->pop_front(); - swap(operands->front(), merge_result); + if (merge_operator->PartialMerge(key.user_key(), + merge_context.GetOperand(0), + merge_context.GetOperand(1), + &merge_result, logger.get())) { + merge_context.PushPartialMergeResult(merge_result); } else { // Stack them because user can't associative merge break; diff --git a/db/memtable.h b/db/memtable.h index 751de3186..8a3a8610c 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -22,6 +22,7 @@ namespace rocksdb { class Mutex; class MemTableIterator; +class MergeContext; class MemTable { public: @@ -94,7 +95,7 @@ class MemTable { // store MergeInProgress in s, and return false. // Else, return false. bool Get(const LookupKey& key, std::string* value, Status* s, - std::deque* operands, const Options& options); + MergeContext& merge_context, const Options& options); // Update the value and return status ok, // if key exists in current memtable diff --git a/db/memtablelist.cc b/db/memtablelist.cc index 4453d1721..48725590b 100644 --- a/db/memtablelist.cc +++ b/db/memtablelist.cc @@ -201,10 +201,9 @@ size_t MemTableList::ApproximateMemoryUsage() { // Return the most recent value found, if any. // Operands stores the list of merge operations to apply, so far. bool MemTableList::Get(const LookupKey& key, std::string* value, Status* s, - std::deque* operands, - const Options& options) { + MergeContext& merge_context, const Options& options) { for (auto &memtable : memlist_) { - if (memtable->Get(key, value, s, operands, options)) { + if (memtable->Get(key, value, s, merge_context, options)) { return true; } } diff --git a/db/memtablelist.h b/db/memtablelist.h index ef10526c9..5f36752f4 100644 --- a/db/memtablelist.h +++ b/db/memtablelist.h @@ -77,7 +77,7 @@ class MemTableList { // Search all the memtables starting from the most recent one. // Return the most recent value found, if any. bool Get(const LookupKey& key, std::string* value, Status* s, - std::deque* operands, const Options& options); + MergeContext& merge_context, const Options& options); // Returns the list of underlying memtables. void GetMemTables(std::vector* list); diff --git a/db/merge_context.h b/db/merge_context.h new file mode 100644 index 000000000..91d9f8a01 --- /dev/null +++ b/db/merge_context.h @@ -0,0 +1,69 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include "db/dbformat.h"
+#include "rocksdb/slice.h"
+#include <string>
+#include <deque>
+
+namespace rocksdb {
+
+const std::deque<std::string> empty_operand_list;
+
+// The merge context for merging a user key.
+// When doing a Get(), DB will create such a class and pass it when
+// issuing the Get() operation to memtables and version_set. The operands
+// will be fetched from the context when issuing a partial or full merge.
+class MergeContext {
+public:
+  // Clear all the operands
+  void Clear() {
+    if (operand_list) {
+      operand_list->clear();
+    }
+  }
+  // Replace the first two operands with merge_result, which is expected to
+  // be the result of merging those two.
+  void PushPartialMergeResult(std::string& merge_result) {
+    assert(operand_list);
+    operand_list->pop_front();
+    swap(operand_list->front(), merge_result);
+  }
+  // Push a merge operand
+  void PushOperand(const Slice& operand_slice) {
+    Initialize();
+    operand_list->push_front(operand_slice.ToString());
+  }
+  // Return the total number of operands in the list
+  size_t GetNumOperands() const {
+    if (!operand_list) {
+      return 0;
+    }
+    return operand_list->size();
+  }
+  // Get the operand at the given index.
+  Slice GetOperand(int index) const {
+    assert(operand_list);
+    return (*operand_list)[index];
+  }
+  // Return all the operands.
+  const std::deque<std::string>& GetOperands() const {
+    if (!operand_list) {
+      return empty_operand_list;
+    }
+    return *operand_list;
+  }
+private:
+  void Initialize() {
+    if (!operand_list) {
+      operand_list.reset(new std::deque<std::string>());
+    }
+  }
+  std::unique_ptr<std::deque<std::string>> operand_list;
+};
+
+} // namespace rocksdb
+
diff --git a/db/version_set.cc b/db/version_set.cc
index 3d4f84484..79b53af45 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -16,6 +16,7 @@
 #include "db/log_reader.h"
 #include "db/log_writer.h"
 #include "db/memtable.h"
+#include "db/merge_context.h"
 #include "db/table_cache.h"
 #include "rocksdb/env.h"
 #include "rocksdb/merge_operator.h"
@@ -287,7 +288,8 @@ struct Saver {
   bool* value_found; // Is value set correctly? Used by KeyMayExist
   std::string* value;
   const MergeOperator* merge_operator;
-  std::deque<std::string>* merge_operands;  // the merge operations encountered
+  // the merge operations encountered
+  MergeContext* merge_context;
   Logger* logger;
   bool didIO;    // did we do any disk io?
   Statistics* statistics;
@@ -309,10 +311,10 @@ static void MarkKeyMayExist(void* arg) {
 static bool SaveValue(void* arg, const Slice& ikey, const Slice& v, bool didIO){
   Saver* s = reinterpret_cast<Saver*>(arg);
-  std::deque<std::string>* const ops = s->merge_operands; // shorter alias
+  MergeContext* merge_contex = s->merge_context;
   std::string merge_result;  // temporary area for merge results later

-  assert(s != nullptr && ops != nullptr);
+  assert(s != nullptr && merge_contex != nullptr);

   ParsedInternalKey parsed_key;
   // TODO: didIO and Merge?
@@ -331,7 +333,8 @@ static bool SaveValue(void* arg, const Slice& ikey, const Slice& v, bool didIO){ } else if (kMerge == s->state) { assert(s->merge_operator != nullptr); s->state = kFound; - if (!s->merge_operator->FullMerge(s->user_key, &v, *ops, + if (!s->merge_operator->FullMerge(s->user_key, &v, + merge_contex->GetOperands(), s->value, s->logger)) { RecordTick(s->statistics, NUMBER_MERGE_FAILURES); s->state = kCorrupt; @@ -346,8 +349,9 @@ static bool SaveValue(void* arg, const Slice& ikey, const Slice& v, bool didIO){ s->state = kDeleted; } else if (kMerge == s->state) { s->state = kFound; - if (!s->merge_operator->FullMerge(s->user_key, nullptr, *ops, - s->value, s->logger)) { + if (!s->merge_operator->FullMerge(s->user_key, nullptr, + merge_contex->GetOperands(), + s->value, s->logger)) { RecordTick(s->statistics, NUMBER_MERGE_FAILURES); s->state = kCorrupt; } @@ -359,16 +363,15 @@ static bool SaveValue(void* arg, const Slice& ikey, const Slice& v, bool didIO){ case kTypeMerge: assert(s->state == kNotFound || s->state == kMerge); s->state = kMerge; - ops->push_front(v.ToString()); - while (ops->size() >= 2) { + merge_contex->PushOperand(v); + while (merge_contex->GetNumOperands() >= 2) { // Attempt to merge operands together via user associateive merge if (s->merge_operator->PartialMerge(s->user_key, - Slice((*ops)[0]), - Slice((*ops)[1]), + merge_contex->GetOperand(0), + merge_contex->GetOperand(1), &merge_result, s->logger)) { - ops->pop_front(); - swap(ops->front(), merge_result); + merge_contex->PushPartialMergeResult(merge_result); } else { // Associative merge returns false ==> stack the operands break; @@ -417,7 +420,7 @@ void Version::Get(const ReadOptions& options, const LookupKey& k, std::string* value, Status* status, - std::deque* operands, + MergeContext* merge_context, GetStats* stats, const Options& db_options, bool* value_found) { @@ -436,7 +439,7 @@ void Version::Get(const ReadOptions& options, saver.value_found = value_found; saver.value = value; saver.merge_operator = merge_operator; - saver.merge_operands = operands; + saver.merge_context = merge_context; saver.logger = logger.get(); saver.didIO = false; saver.statistics = db_options.statistics.get(); @@ -564,7 +567,8 @@ void Version::Get(const ReadOptions& options, if (kMerge == saver.state) { // merge_operands are in saver and we hit the beginning of the key history // do a final merge of nullptr and operands; - if (merge_operator->FullMerge(user_key, nullptr, *saver.merge_operands, + if (merge_operator->FullMerge(user_key, nullptr, + saver.merge_context->GetOperands(), value, logger.get())) { *status = Status::OK(); } else { diff --git a/db/version_set.h b/db/version_set.h index 38415173c..bf466a932 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -38,6 +38,7 @@ class MemTable; class TableCache; class Version; class VersionSet; +class MergeContext; // Return the smallest index i such that files[i]->largest >= key. // Return files.size() if there is no such file. @@ -76,9 +77,9 @@ class Version { int seek_file_level; }; void Get(const ReadOptions&, const LookupKey& key, std::string* val, - Status* status, std::deque* operands, GetStats* stats, - const Options& db_option, - bool* value_found = nullptr); + Status* status, MergeContext* merge_context, + GetStats* stats, const Options& db_option, bool* value_found = + nullptr); // Adds "stats" into the current state. Returns true if a new // compaction may need to be triggered, false otherwise. 
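Before the final file in this patch, the core idea is worth seeing once in isolation: MergeContext defers allocating its operand deque until the first merge operand actually appears, so point lookups that never encounter a kTypeMerge entry pay no allocation cost. Below is a condensed, self-contained sketch of that pattern; the class name LazyOperandList and its members are illustrative stand-ins, not the merge_context.h code itself.

#include <cassert>
#include <deque>
#include <iostream>
#include <memory>
#include <string>

// Condensed sketch of the lazy-operand-list pattern: the deque is only
// heap-allocated once the first merge operand is pushed, so plain reads
// never pay for the allocation.
class LazyOperandList {
 public:
  void PushOperand(const std::string& operand) {
    if (!operands_) {
      operands_.reset(new std::deque<std::string>());  // first operand: allocate now
    }
    operands_->push_front(operand);  // newest operand goes to the front
  }
  size_t GetNumOperands() const { return operands_ ? operands_->size() : 0; }
  const std::deque<std::string>& GetOperands() const {
    static const std::deque<std::string> kEmpty;  // shared empty list for the common case
    return operands_ ? *operands_ : kEmpty;
  }
 private:
  std::unique_ptr<std::deque<std::string>> operands_;  // null until a merge is seen
};

int main() {
  LazyOperandList ctx;
  assert(ctx.GetNumOperands() == 0);  // nothing allocated yet
  ctx.PushOperand("+1");
  ctx.PushOperand("+2");
  std::cout << "operands: " << ctx.GetNumOperands() << "\n";  // prints 2
  return 0;
}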
diff --git a/include/rocksdb/merge_operator.h b/include/rocksdb/merge_operator.h index ddb3102e3..bd4c36c07 100644 --- a/include/rocksdb/merge_operator.h +++ b/include/rocksdb/merge_operator.h @@ -6,6 +6,7 @@ #ifndef STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_ #define STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_ +#include #include #include #include "rocksdb/slice.h" From c7707f24c2cbdab7af6e62022dcc8341033afce0 Mon Sep 17 00:00:00 2001 From: kailiu Date: Fri, 6 Dec 2013 16:51:35 -0800 Subject: [PATCH 20/70] Refine the statistics --- include/rocksdb/statistics.h | 92 +++++++++++++++++------------------- util/statistics_imp.h | 2 +- 2 files changed, 45 insertions(+), 49 deletions(-) diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 286a624c8..bd5d485de 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -7,7 +7,6 @@ #define STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_ #include -#include #include #include #include @@ -18,10 +17,8 @@ namespace rocksdb { /** * Keep adding ticker's here. - * Any ticker should have a value less than TICKER_ENUM_MAX. - * Add a new ticker by assigning it the current value of TICKER_ENUM_MAX - * Add a string representation in TickersNameMap below. - * And incrementing TICKER_ENUM_MAX. + * 1. Any ticker should be added before TICKER_ENUM_MAX. + * 2. Add a readable string in TickersNameMap below for the newly added ticker. */ enum Tickers { // total block cache misses @@ -120,46 +117,46 @@ enum Tickers { // The order of items listed in Tickers should be the same as // the order listed in TickersNameMap const std::vector> TickersNameMap = { - { BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" }, - { BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" }, - { BLOCK_CACHE_ADD, "rocksdb.block.cache.add" }, - { BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss" }, - { BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit" }, - { BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss" }, - { BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit" }, - { BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss" }, - { BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit" }, - { BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful" }, - { MEMTABLE_HIT, "rocksdb.memtable.hit" }, - { MEMTABLE_MISS, "rocksdb.memtable.miss" }, - { COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new" }, - { COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete" }, - { COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user" }, - { NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written" }, - { NUMBER_KEYS_READ, "rocksdb.number.keys.read" }, - { NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated" }, - { BYTES_WRITTEN, "rocksdb.bytes.written" }, - { BYTES_READ, "rocksdb.bytes.read" }, - { NO_FILE_CLOSES, "rocksdb.no.file.closes" }, - { NO_FILE_OPENS, "rocksdb.no.file.opens" }, - { NO_FILE_ERRORS, "rocksdb.no.file.errors" }, - { STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros" }, - { STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros" }, - { STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros" }, - { RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis" }, - { NO_ITERATORS, "rocksdb.num.iterators" }, - { NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get" }, - { NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read" }, - { NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read" }, - { NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered" }, - { NUMBER_MERGE_FAILURES, 
"rocksdb.number.merge.failures" }, - { SEQUENCE_NUMBER, "rocksdb.sequence.number" }, - { BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked" }, - { BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" }, - { NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration" }, - { GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls" }, - { BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss" }, - { BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit" } + { BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" }, + { BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" }, + { BLOCK_CACHE_ADD, "rocksdb.block.cache.add" }, + { BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss" }, + { BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit" }, + { BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss" }, + { BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit" }, + { BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss" }, + { BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit" }, + { BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful" }, + { MEMTABLE_HIT, "rocksdb.memtable.hit" }, + { MEMTABLE_MISS, "rocksdb.memtable.miss" }, + { COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new" }, + { COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete" }, + { COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user" }, + { NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written" }, + { NUMBER_KEYS_READ, "rocksdb.number.keys.read" }, + { NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated" }, + { BYTES_WRITTEN, "rocksdb.bytes.written" }, + { BYTES_READ, "rocksdb.bytes.read" }, + { NO_FILE_CLOSES, "rocksdb.no.file.closes" }, + { NO_FILE_OPENS, "rocksdb.no.file.opens" }, + { NO_FILE_ERRORS, "rocksdb.no.file.errors" }, + { STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros" }, + { STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros" }, + { STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros" }, + { RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis" }, + { NO_ITERATORS, "rocksdb.num.iterators" }, + { NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get" }, + { NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read" }, + { NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read" }, + { NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered" }, + { NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures" }, + { SEQUENCE_NUMBER, "rocksdb.sequence.number" }, + { BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked" }, + { BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" }, + { NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration" }, + { GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls" }, + { BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss" }, + { BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit" }, }; /** @@ -254,7 +251,7 @@ class Ticker { count_ = count; } - inline void recordTick(int count = 1) { + inline void recordTick(int64_t count = 1) { count_ += count; } @@ -269,13 +266,12 @@ class Ticker { // Analyze the performance of a db class Statistics { public: - virtual long getTickerCount(Tickers tickerType) = 0; virtual void recordTick(Tickers tickerType, uint64_t count = 0) = 0; virtual void setTickerCount(Tickers tickerType, uint64_t count) = 0; virtual void measureTime(Histograms histogramType, uint64_t time) = 0; - virtual void histogramData(Histograms type, HistogramData * const data) = 0; + virtual 
void histogramData(Histograms type, HistogramData* const data) = 0;
   // String representation of the statistic object.
   std::string ToString();
 };
diff --git a/util/statistics_imp.h b/util/statistics_imp.h
index 0dc8884c1..3c8386371 100644
--- a/util/statistics_imp.h
+++ b/util/statistics_imp.h
@@ -11,7 +11,7 @@ namespace rocksdb {
 // Utility functions
 inline void RecordTick(Statistics* statistics, Tickers ticker,
-                       uint64_t count = 1) {
+                       int64_t count = 1) {
   assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX);
   assert(TickersNameMap.size() == TICKER_ENUM_MAX);
   if (statistics) {

From 3a7c5bd40a3ff2bb72d334ad6d88dd496c8c2481 Mon Sep 17 00:00:00 2001
From: Igor Canadi
Date: Mon, 9 Dec 2013 10:44:21 -0800
Subject: [PATCH 21/70] Added branch detection in perf branch

---
 build_tools/regression_build_test.sh | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh
index 1c44e5ad2..b0140ef48 100755
--- a/build_tools/regression_build_test.sh
+++ b/build_tools/regression_build_test.sh
@@ -26,6 +26,12 @@ function cleanup {
 }
 trap cleanup EXIT

+git_br=$(basename $GIT_BRANCH)
+if [ $git_br == "master" ]; then
+  git_br=""
+else
+  git_br="."$git_br
+fi
 make clean
 OPT=-DNDEBUG make db_bench -j$(nproc)

@@ -150,7 +156,7 @@ function send_to_ods {
     echo >&2 "ERROR: Key $key doesn't have a value."
     return
   fi
-  curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build&key=$key&value=$value" \
+  curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build$git_br&key=$key&value=$value" \
     --connect-timeout 60
 }

From c79e5954716f5276d37ec7f4b2634a392ff2acca Mon Sep 17 00:00:00 2001
From: kailiu
Date: Tue, 10 Dec 2013 17:34:35 -0800
Subject: [PATCH 22/70] Make Cache::GetCapacity constant

Summary: This makes the cache capacity accessible through a const path, e.g.
`DB::GetOptions().table_cache.GetCapacity()` or
`DB::GetOptions().block_cache.GetCapacity()`, since GetOptions() is itself a
const method.
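An aside on why the one-word qualifier matters: a non-const member function cannot be invoked through a const access path, so without this change the chained call in the summary would not compile. The toy classes below (ToyCache, ToyOptions, ToyDB are illustrative stand-ins, not the RocksDB headers) show the call compiling only because GetCapacity() is const.

#include <cstddef>
#include <iostream>

// Simplified stand-ins; only the const-ness is the point here.
class ToyCache {
 public:
  explicit ToyCache(size_t capacity) : capacity_(capacity) {}
  // Must be const: it is reached through the const reference below.
  size_t GetCapacity() const { return capacity_; }
 private:
  size_t capacity_;
};

struct ToyOptions {
  ToyCache block_cache{8 << 20};
};

class ToyDB {
 public:
  // Const accessor, mirroring the summary's point that GetOptions() is const.
  const ToyOptions& GetOptions() const { return options_; }
 private:
  ToyOptions options_;
};

int main() {
  ToyDB db;
  // Compiles only because GetCapacity() is const; through a const
  // reference, non-const member functions are not callable.
  std::cout << db.GetOptions().block_cache.GetCapacity() << "\n";
  return 0;
}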
--- include/rocksdb/cache.h | 2 +- util/cache.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 3e0e5c1cd..201d82f19 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -102,7 +102,7 @@ class Cache { virtual uint64_t NewId() = 0; // returns the maximum configured capacity of the cache - virtual size_t GetCapacity() = 0; + virtual size_t GetCapacity() const = 0; private: void LRU_Remove(Handle* e); diff --git a/util/cache.cc b/util/cache.cc index deec52864..b9d41be49 100644 --- a/util/cache.cc +++ b/util/cache.cc @@ -404,7 +404,7 @@ class ShardedLRUCache : public Cache { MutexLock l(&id_mutex_); return ++(last_id_); } - virtual size_t GetCapacity() { + virtual size_t GetCapacity() const { return capacity_; } }; From a82f42b76545c7e212e9bdb16f4c9fb769ffb127 Mon Sep 17 00:00:00 2001 From: kailiu Date: Tue, 10 Dec 2013 19:03:13 -0800 Subject: [PATCH 23/70] rename db/memtablelist.{h,cc} --- db/db_impl.cc | 2 +- db/db_impl.h | 2 +- db/{memtablelist.cc => memtable_list.cc} | 2 +- db/{memtablelist.h => memtable_list.h} | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename db/{memtablelist.cc => memtable_list.cc} (99%) rename db/{memtablelist.h => memtable_list.h} (100%) diff --git a/db/db_impl.cc b/db/db_impl.cc index b1d016ffd..d29449c4e 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -26,7 +26,7 @@ #include "db/log_reader.h" #include "db/log_writer.h" #include "db/memtable.h" -#include "db/memtablelist.h" +#include "db/memtable_list.h" #include "db/merge_context.h" #include "db/merge_helper.h" #include "db/prefix_filter_iterator.h" diff --git a/db/db_impl.h b/db/db_impl.h index d7a346b6e..8fd141a21 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -21,7 +21,7 @@ #include "rocksdb/transaction_log.h" #include "port/port.h" #include "util/stats_logger.h" -#include "memtablelist.h" +#include "memtable_list.h" namespace rocksdb { diff --git a/db/memtablelist.cc b/db/memtable_list.cc similarity index 99% rename from db/memtablelist.cc rename to db/memtable_list.cc index 71e4e5a92..27e12b945 100644 --- a/db/memtablelist.cc +++ b/db/memtable_list.cc @@ -3,7 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
// -#include "db/memtablelist.h" +#include "db/memtable_list.h" #include #include "rocksdb/db.h" diff --git a/db/memtablelist.h b/db/memtable_list.h similarity index 100% rename from db/memtablelist.h rename to db/memtable_list.h From 3c02c363b3971ca937f01c9c575ae99873c852b8 Mon Sep 17 00:00:00 2001 From: Haobo Xu Date: Wed, 27 Nov 2013 14:27:02 -0800 Subject: [PATCH 24/70] [RocksDB] [Performance Branch] Added dynamic bloom, to be used for memable non-existing key filtering Summary: as title Test Plan: dynamic_bloom_test Reviewers: dhruba, sdong, kailiu CC: leveldb Differential Revision: https://reviews.facebook.net/D14385 --- Makefile | 4 ++ db/memtable.cc | 92 ++++++++++++++++++++------- db/memtable.h | 4 ++ db/prefix_test.cc | 30 ++++++--- include/rocksdb/memtablerep.h | 2 +- include/rocksdb/options.h | 8 +++ util/bloom_test.cc | 8 ++- util/dynamic_bloom.cc | 63 +++++++++++++++++++ util/dynamic_bloom.h | 42 +++++++++++++ util/dynamic_bloom_test.cc | 113 ++++++++++++++++++++++++++++++++++ util/hash_skiplist_rep.cc | 4 +- util/options.cc | 9 ++- 12 files changed, 346 insertions(+), 33 deletions(-) create mode 100644 util/dynamic_bloom.cc create mode 100644 util/dynamic_bloom.h create mode 100644 util/dynamic_bloom_test.cc diff --git a/Makefile b/Makefile index 62f180846..826e1cd60 100644 --- a/Makefile +++ b/Makefile @@ -50,6 +50,7 @@ TESTS = \ auto_roll_logger_test \ block_test \ bloom_test \ + dynamic_bloom_test \ c_test \ cache_test \ coding_test \ @@ -228,6 +229,9 @@ table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJEC bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +dynamic_bloom_test: util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/db/memtable.cc b/db/memtable.cc index f396bc082..9b5df942d 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -52,7 +52,14 @@ MemTable::MemTable(const InternalKeyComparator& cmp, mem_logfile_number_(0), locks_(options.inplace_update_support ? 
options.inplace_update_num_locks - : 0) { } + : 0), + prefix_extractor_(options.prefix_extractor) { + + if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) { + prefix_bloom_.reset(new DynamicBloom(options.memtable_prefix_bloom_bits, + options.memtable_prefix_bloom_probes)); + } +} MemTable::~MemTable() { assert(refs_ == 0); @@ -88,27 +95,53 @@ const char* EncodeKey(std::string* scratch, const Slice& target) { class MemTableIterator: public Iterator { public: - MemTableIterator(MemTableRep* table, const ReadOptions& options) - : iter_() { + MemTableIterator(const MemTable& mem, const ReadOptions& options) + : mem_(mem), iter_(), dynamic_prefix_seek_(false), valid_(false) { if (options.prefix) { - iter_ = table->GetPrefixIterator(*options.prefix); + iter_ = mem_.table_->GetPrefixIterator(*options.prefix); } else if (options.prefix_seek) { - iter_ = table->GetDynamicPrefixIterator(); + dynamic_prefix_seek_ = true; + iter_ = mem_.table_->GetDynamicPrefixIterator(); } else { - iter_ = table->GetIterator(); + iter_ = mem_.table_->GetIterator(); } } - virtual bool Valid() const { return iter_->Valid(); } - virtual void Seek(const Slice& k) { iter_->Seek(k, nullptr); } - virtual void SeekToFirst() { iter_->SeekToFirst(); } - virtual void SeekToLast() { iter_->SeekToLast(); } - virtual void Next() { iter_->Next(); } - virtual void Prev() { iter_->Prev(); } + virtual bool Valid() const { return valid_; } + virtual void Seek(const Slice& k) { + if (dynamic_prefix_seek_ && mem_.prefix_bloom_ && + !mem_.prefix_bloom_->MayContain( + mem_.prefix_extractor_->Transform(ExtractUserKey(k)))) { + valid_ = false; + return; + } + iter_->Seek(k, nullptr); + valid_ = iter_->Valid(); + } + virtual void SeekToFirst() { + iter_->SeekToFirst(); + valid_ = iter_->Valid(); + } + virtual void SeekToLast() { + iter_->SeekToLast(); + valid_ = iter_->Valid(); + } + virtual void Next() { + assert(Valid()); + iter_->Next(); + valid_ = iter_->Valid(); + } + virtual void Prev() { + assert(Valid()); + iter_->Prev(); + valid_ = iter_->Valid(); + } virtual Slice key() const { + assert(Valid()); return GetLengthPrefixedSlice(iter_->key()); } virtual Slice value() const { + assert(Valid()); Slice key_slice = GetLengthPrefixedSlice(iter_->key()); return GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); } @@ -116,7 +149,10 @@ class MemTableIterator: public Iterator { virtual Status status() const { return Status::OK(); } private: + const MemTable& mem_; std::shared_ptr iter_; + bool dynamic_prefix_seek_; + bool valid_; // No copying allowed MemTableIterator(const MemTableIterator&); @@ -124,7 +160,7 @@ class MemTableIterator: public Iterator { }; Iterator* MemTable::NewIterator(const ReadOptions& options) { - return new MemTableIterator(table_.get(), options); + return new MemTableIterator(*this, options); } port::RWMutex* MemTable::GetLock(const Slice& key) { @@ -132,7 +168,7 @@ port::RWMutex* MemTable::GetLock(const Slice& key) { } void MemTable::Add(SequenceNumber s, ValueType type, - const Slice& key, + const Slice& key, /* user key */ const Slice& value) { // Format of an entry is concatenation of: // key_size : varint32 of internal_key.size() @@ -156,6 +192,11 @@ void MemTable::Add(SequenceNumber s, ValueType type, assert((p + val_size) - buf == (unsigned)encoded_len); table_->Insert(buf); + if (prefix_bloom_) { + assert(prefix_extractor_); + prefix_bloom_->Add(prefix_extractor_->Transform(key)); + } + // The first sequence number inserted into the memtable assert(first_seqno_ == 0 || s > first_seqno_); 
if (first_seqno_ == 0) { @@ -168,10 +209,17 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, StopWatchNano memtable_get_timer(options.env, false); StartPerfTimer(&memtable_get_timer); - Slice memkey = key.memtable_key(); - std::shared_ptr iter( - table_->GetIterator(key.user_key())); - iter->Seek(key.user_key(), memkey.data()); + Slice mem_key = key.memtable_key(); + Slice user_key = key.user_key(); + + std::shared_ptr iter; + if (prefix_bloom_ && + !prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key))) { + // iter is null if prefix bloom says the key does not exist + } else { + iter = table_->GetIterator(user_key); + iter->Seek(user_key, mem_key.data()); + } bool merge_in_progress = s->IsMergeInProgress(); auto merge_operator = options.merge_operator.get(); @@ -179,7 +227,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, std::string merge_result; bool found_final_value = false; - for (; !found_final_value && iter->Valid(); iter->Next()) { + for (; !found_final_value && iter && iter->Valid(); iter->Next()) { // entry format is: // klength varint32 // userkey char[klength-8] @@ -278,11 +326,12 @@ bool MemTable::Update(SequenceNumber seq, ValueType type, const Slice& key, const Slice& value) { LookupKey lkey(key, seq); - Slice memkey = lkey.memtable_key(); + Slice mem_key = lkey.memtable_key(); std::shared_ptr iter( table_->GetIterator(lkey.user_key())); - iter->Seek(key, memkey.data()); + iter->Seek(key, mem_key.data()); + if (iter->Valid()) { // entry format is: @@ -319,6 +368,7 @@ bool MemTable::Update(SequenceNumber seq, ValueType type, VarintLength(value.size()) + value.size()) ); + // no need to update bloom, as user key does not change. return true; } } diff --git a/db/memtable.h b/db/memtable.h index 7edb5681d..946c99bf2 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -17,6 +17,7 @@ #include "rocksdb/db.h" #include "rocksdb/memtablerep.h" #include "util/arena_impl.h" +#include "util/dynamic_bloom.h" namespace rocksdb { @@ -171,6 +172,9 @@ class MemTable { // Get the lock associated for the key port::RWMutex* GetLock(const Slice& key); + + const SliceTransform* const prefix_extractor_; + std::unique_ptr prefix_bloom_; }; extern const char* EncodeKey(std::string* scratch, const Slice& target); diff --git a/db/prefix_test.cc b/db/prefix_test.cc index d76285381..f66091d11 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -16,12 +16,15 @@ DEFINE_bool(trigger_deadlock, false, DEFINE_uint64(bucket_count, 100000, "number of buckets"); DEFINE_uint64(num_locks, 10001, "number of locks"); DEFINE_bool(random_prefix, false, "randomize prefix"); -DEFINE_uint64(total_prefixes, 1000, "total number of prefixes"); -DEFINE_uint64(items_per_prefix, 10, "total number of values per prefix"); -DEFINE_int64(write_buffer_size, 1000000000, ""); -DEFINE_int64(max_write_buffer_number, 8, ""); -DEFINE_int64(min_write_buffer_number_to_merge, 7, ""); +DEFINE_uint64(total_prefixes, 100000, "total number of prefixes"); +DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix"); +DEFINE_int64(write_buffer_size, 33554432, ""); +DEFINE_int64(max_write_buffer_number, 2, ""); +DEFINE_int64(min_write_buffer_number_to_merge, 1, ""); DEFINE_int32(skiplist_height, 4, ""); +DEFINE_int32(memtable_prefix_bloom_bits, 10000000, ""); +DEFINE_int32(memtable_prefix_bloom_probes, 10, ""); +DEFINE_int32(value_size, 40, ""); // Path to the database on file system const std::string kDbName = rocksdb::test::TmpDir() + "/prefix_test"; @@ -120,6 
+123,9 @@ class PrefixTest { } } + options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits; + options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes; + Status s = DB::Open(options, kDbName, &db); ASSERT_OK(s); return std::shared_ptr(db); @@ -147,18 +153,28 @@ TEST(PrefixTest, DynamicPrefixIterator) { std::random_shuffle(prefixes.begin(), prefixes.end()); } + HistogramImpl hist_put_time; + HistogramImpl hist_put_comparison; + // insert x random prefix, each with y continuous element. for (auto prefix : prefixes) { for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { TestKey test_key(prefix, sorted); Slice key = TestKeyToSlice(test_key); - std::string value(40, 0); + std::string value(FLAGS_value_size, 0); + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); ASSERT_OK(db->Put(write_options, key, value)); + hist_put_time.Add(timer.ElapsedNanos()); + hist_put_comparison.Add(perf_context.user_key_comparison_count); } } + std::cout << "Put key comparison: \n" << hist_put_comparison.ToString() + << "Put time: \n" << hist_put_time.ToString(); + // test seek existing keys HistogramImpl hist_seek_time; HistogramImpl hist_seek_comparison; @@ -200,7 +216,7 @@ TEST(PrefixTest, DynamicPrefixIterator) { HistogramImpl hist_no_seek_comparison; for (auto prefix = FLAGS_total_prefixes; - prefix < FLAGS_total_prefixes + 100; + prefix < FLAGS_total_prefixes + 10000; prefix++) { TestKey test_key(prefix, 0); Slice key = TestKeyToSlice(test_key); diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index 30ea3cf41..c50c7b61a 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -99,7 +99,7 @@ class MemTableRep { virtual void Prev() = 0; // Advance to the first entry with a key >= target - virtual void Seek(const Slice& user_key, const char* memtable_key) = 0; + virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0; // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 2befb05b9..5041ea593 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -635,6 +635,14 @@ struct Options { // Number of locks used for inplace update // Default: 10000, if inplace_update_support = true, else 0. size_t inplace_update_num_locks; + + // if prefix_extractor is set and bloom_bits is not 0, create prefix bloom + // for memtable + uint32_t memtable_prefix_bloom_bits; + + // number of hash probes per key + uint32_t memtable_prefix_bloom_probes; + }; // diff --git a/util/bloom_test.cc b/util/bloom_test.cc index 9dbd5d2cc..2c430e203 100644 --- a/util/bloom_test.cc +++ b/util/bloom_test.cc @@ -7,12 +7,16 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include + #include "rocksdb/filter_policy.h" #include "util/logging.h" #include "util/testharness.h" #include "util/testutil.h" +DEFINE_int32(bits_per_key, 10, ""); + namespace rocksdb { static const int kVerbose = 1; @@ -29,7 +33,7 @@ class BloomTest { std::vector keys_; public: - BloomTest() : policy_(NewBloomFilterPolicy(10)) { } + BloomTest() : policy_(NewBloomFilterPolicy(FLAGS_bits_per_key)) { } ~BloomTest() { delete policy_; @@ -160,5 +164,7 @@ TEST(BloomTest, VaryingLengths) { } // namespace rocksdb int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + return rocksdb::test::RunAllTests(); } diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc new file mode 100644 index 000000000..be47ab55a --- /dev/null +++ b/util/dynamic_bloom.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "dynamic_bloom.h" + +#include "rocksdb/slice.h" +#include "util/hash.h" + +namespace rocksdb { + +namespace { +static uint32_t BloomHash(const Slice& key) { + return Hash(key.data(), key.size(), 0xbc9f1d34); +} +} + +DynamicBloom::DynamicBloom(uint32_t total_bits, + uint32_t (*hash_func)(const Slice& key), + uint32_t num_probes) + : hash_func_(hash_func), + total_bits_((total_bits + 7) / 8 * 8), + num_probes_(num_probes) { + assert(hash_func_); + assert(num_probes_ > 0); + assert(total_bits_ > 0); + data_.reset(new unsigned char[total_bits_ / 8]()); +} + +DynamicBloom::DynamicBloom(uint32_t total_bits, + uint32_t num_probes) + : hash_func_(&BloomHash), + total_bits_((total_bits + 7) / 8 * 8), + num_probes_(num_probes) { + assert(num_probes_ > 0); + assert(total_bits_ > 0); + data_.reset(new unsigned char[total_bits_ / 8]()); +} + +void DynamicBloom::Add(const Slice& key) { + uint32_t h = hash_func_(key); + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (uint32_t i = 0; i < num_probes_; i++) { + const uint32_t bitpos = h % total_bits_; + data_[bitpos/8] |= (1 << (bitpos % 8)); + h += delta; + } +} + +bool DynamicBloom::MayContain(const Slice& key) { + uint32_t h = hash_func_(key); + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (uint32_t i = 0; i < num_probes_; i++) { + const uint32_t bitpos = h % total_bits_; + if ((data_[bitpos/8] & (1 << (bitpos % 8))) + == 0) return false; + h += delta; + } + return true; +} + +} diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h new file mode 100644 index 000000000..c496e2ce7 --- /dev/null +++ b/util/dynamic_bloom.h @@ -0,0 +1,42 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
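// Aside, not part of this patch: the Add()/MayContain() pair above derives all
// num_probes_ bit positions from a single 32-bit hash via double hashing --
// h, h+delta, h+2*delta, ... where delta is h rotated right by 17 bits. The
// standalone sketch below reproduces just that probe sequence; ToyHash and
// the parameters are illustrative stand-ins, not util/hash.h.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Toy 32-bit hash (FNV-1a); any decent mix function works for the sketch.
static uint32_t ToyHash(const char* data, size_t n) {
  uint32_t h = 2166136261u;
  for (size_t i = 0; i < n; i++) {
    h = (h ^ static_cast<unsigned char>(data[i])) * 16777619u;
  }
  return h;
}

int main() {
  const uint32_t total_bits = 128;              // already a multiple of 8
  const uint32_t num_probes = 4;
  std::vector<unsigned char> bits(total_bits / 8, 0);

  const char key[] = "hello";
  uint32_t h = ToyHash(key, sizeof(key) - 1);
  const uint32_t delta = (h >> 17) | (h << 15);  // rotate right 17 bits
  for (uint32_t i = 0; i < num_probes; i++) {
    const uint32_t bitpos = h % total_bits;
    bits[bitpos / 8] |= (1 << (bitpos % 8));     // set the probe bit, as in Add()
    printf("probe %u -> bit %u\n", i, bitpos);
    h += delta;                                   // advance to the next probe
  }
  return 0;
}

// Design note: double hashing yields k well-spread probes for the price of
// one hash computation, which is why the constructors above only ever take a
// single hash function.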
+ +#pragma once + +#include +#include + +namespace rocksdb { + +class Slice; + +class DynamicBloom { + + public: + + // total_bits: fixed total bits for the bloom + // hash_func: customized hash function + // num_probes: number of hash probes for a single key + DynamicBloom(uint32_t total_bits, + uint32_t (*hash_func)(const Slice& key), + uint32_t num_probes = 6); + + explicit DynamicBloom(uint32_t total_bits, uint32_t num_probes = 6); + + // Assuming single threaded access to Add + void Add(const Slice& key); + + // Multithreaded access to MayContain is OK + bool MayContain(const Slice& key); + + + private: + uint32_t (*hash_func_)(const Slice& key); + uint32_t total_bits_; + uint32_t num_probes_; + std::unique_ptr data_; +}; + +} diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc new file mode 100644 index 000000000..58f05ae50 --- /dev/null +++ b/util/dynamic_bloom_test.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include + +#include "dynamic_bloom.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +DEFINE_int32(bits_per_key, 10, ""); +DEFINE_int32(num_probes, 6, ""); + +namespace rocksdb { + +static Slice Key(int i, char* buffer) { + memcpy(buffer, &i, sizeof(i)); + return Slice(buffer, sizeof(i)); +} + +class DynamicBloomTest { +}; + +TEST(DynamicBloomTest, EmptyFilter) { + DynamicBloom bloom(100, 2); + ASSERT_TRUE(! bloom.MayContain("hello")); + ASSERT_TRUE(! bloom.MayContain("world")); +} + +TEST(DynamicBloomTest, Small) { + DynamicBloom bloom(100, 2); + bloom.Add("hello"); + bloom.Add("world"); + ASSERT_TRUE(bloom.MayContain("hello")); + ASSERT_TRUE(bloom.MayContain("world")); + ASSERT_TRUE(! bloom.MayContain("x")); + ASSERT_TRUE(! 
bloom.MayContain("foo")); +} + +static int NextLength(int length) { + if (length < 10) { + length += 1; + } else if (length < 100) { + length += 10; + } else if (length < 1000) { + length += 100; + } else { + length += 1000; + } + return length; +} + +TEST(DynamicBloomTest, VaryingLengths) { + char buffer[sizeof(int)]; + + // Count number of filters that significantly exceed the false positive rate + int mediocre_filters = 0; + int good_filters = 0; + + fprintf(stderr, "bits_per_key: %d num_probes: %d\n", + FLAGS_bits_per_key, FLAGS_num_probes); + + for (int length = 1; length <= 10000; length = NextLength(length)) { + uint32_t bloom_bits = std::max(length * FLAGS_bits_per_key, 64); + DynamicBloom bloom(bloom_bits, FLAGS_num_probes); + for (int i = 0; i < length; i++) { + bloom.Add(Key(i, buffer)); + ASSERT_TRUE(bloom.MayContain(Key(i, buffer))); + } + + // All added keys must match + for (int i = 0; i < length; i++) { + ASSERT_TRUE(bloom.MayContain(Key(i, buffer))) + << "Length " << length << "; key " << i; + } + + // Check false positive rate + + int result = 0; + for (int i = 0; i < 10000; i++) { + if (bloom.MayContain(Key(i + 1000000000, buffer))) { + result++; + } + } + double rate = result / 10000.0; + + fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; \n", + rate*100.0, length); + + //ASSERT_LE(rate, 0.02); // Must not be over 2% + if (rate > 0.0125) + mediocre_filters++; // Allowed, but not too often + else + good_filters++; + } + + fprintf(stderr, "Filters: %d good, %d mediocre\n", + good_filters, mediocre_filters); + + ASSERT_LE(mediocre_filters, good_filters/5); +} + +// Different bits-per-byte + +} // namespace rocksdb + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + + return rocksdb::test::RunAllTests(); +} diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc index 3ca6835bd..95e6edfae 100644 --- a/util/hash_skiplist_rep.cc +++ b/util/hash_skiplist_rep.cc @@ -119,11 +119,11 @@ class HashSkipListRep : public MemTableRep { } // Advance to the first entry with a key >= target - virtual void Seek(const Slice& user_key, const char* memtable_key) { + virtual void Seek(const Slice& internal_key, const char* memtable_key) { if (list_ != nullptr) { const char* encoded_key = (memtable_key != nullptr) ? 
- memtable_key : EncodeKey(&tmp_, user_key); + memtable_key : EncodeKey(&tmp_, internal_key); iter_.Seek(encoded_key); } } diff --git a/util/options.cc b/util/options.cc index 198d55384..c89d45bb0 100644 --- a/util/options.cc +++ b/util/options.cc @@ -101,7 +101,9 @@ Options::Options() table_factory( std::shared_ptr(new BlockBasedTableFactory())), inplace_update_support(false), - inplace_update_num_locks(10000) { + inplace_update_num_locks(10000), + memtable_prefix_bloom_bits(0), + memtable_prefix_bloom_probes(6) { assert(memtable_factory.get() != nullptr); } @@ -292,6 +294,11 @@ Options::Dump(Logger* log) const inplace_update_support); Log(log, " Options.inplace_update_num_locks: %zd", inplace_update_num_locks); + // TODO: easier config for bloom (maybe based on avg key/value size) + Log(log, " Options.memtable_prefix_bloom_bits: %d", + memtable_prefix_bloom_bits); + Log(log, " Options.memtable_prefix_bloom_probes: %d", + memtable_prefix_bloom_probes); } // Options::Dump // From 95a411d8530c5de173238ff34400f57df6bb8b1f Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Tue, 10 Dec 2013 20:03:27 -0800 Subject: [PATCH 25/70] When flushing mem tables, create iterators out of mutex Summary: creating new iterators of mem tables can be expensive. Move them out of mutex. DBImpl::WriteLevel0Table()'s mems seems to be a local vector and is only used by flushing. memtables to flush are also immutable, so it should be safe to do so. Test Plan: make all check Reviewers: haobo, dhruba, kailiu Reviewed By: dhruba CC: igor, leveldb Differential Revision: https://reviews.facebook.net/D14577 --- db/db_impl.cc | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index d29449c4e..895bb20cc 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1054,27 +1054,26 @@ Status DBImpl::WriteLevel0Table(std::vector &mems, VersionEdit* edit, *filenumber = meta.number; pending_outputs_.insert(meta.number); - std::vector list; - for (MemTable* m : mems) { - Log(options_.info_log, - "Flushing memtable with log file: %lu\n", - (unsigned long)m->GetLogNumber()); - list.push_back(m->NewIterator()); - } - Iterator* iter = NewMergingIterator(env_, &internal_comparator_, &list[0], - list.size()); const SequenceNumber newest_snapshot = snapshots_.GetNewest(); const SequenceNumber earliest_seqno_in_memtable = mems[0]->GetFirstSequenceNumber(); - Log(options_.info_log, - "Level-0 flush table #%lu: started", - (unsigned long)meta.number); - Version* base = versions_->current(); base->Ref(); // it is likely that we do not need this reference Status s; { mutex_.Unlock(); + std::vector list; + for (MemTable* m : mems) { + Log(options_.info_log, + "Flushing memtable with log file: %lu\n", + (unsigned long)m->GetLogNumber()); + list.push_back(m->NewIterator()); + } + Iterator* iter = NewMergingIterator(env_, &internal_comparator_, &list[0], + list.size()); + Log(options_.info_log, + "Level-0 flush table #%lu: started", + (unsigned long)meta.number); // We skip compression if universal compression is used and the size // threshold is set for compression. 
bool enable_compression = (options_.compaction_style @@ -1085,15 +1084,15 @@ Status DBImpl::WriteLevel0Table(std::vector &mems, VersionEdit* edit, user_comparator(), newest_snapshot, earliest_seqno_in_memtable, enable_compression); LogFlush(options_.info_log); + delete iter; + Log(options_.info_log, "Level-0 flush table #%lu: %lu bytes %s", + (unsigned long) meta.number, + (unsigned long) meta.file_size, + s.ToString().c_str()); mutex_.Lock(); } base->Unref(); - Log(options_.info_log, "Level-0 flush table #%lu: %lu bytes %s", - (unsigned long) meta.number, - (unsigned long) meta.file_size, - s.ToString().c_str()); - delete iter; // re-acquire the most current version base = versions_->current(); From 41349d9ef1fb1cdeb9c7b942678199c90db2ce22 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Mon, 9 Dec 2013 14:28:26 -0800 Subject: [PATCH 26/70] [RocksDB Performance Branch] Avoid sorting in Version::Get() by presorting them in VersionSet::Builder::SaveTo() Summary: Pre-sort files in VersionSet::Builder::SaveTo() so that when getting the value, no need to sort them. It can avoid the costs of vector operations and sorting in Version::Get(). Test Plan: make all check Reviewers: haobo, kailiu, dhruba Reviewed By: dhruba CC: nkg-, igor, leveldb Differential Revision: https://reviews.facebook.net/D14409 --- db/version_set.cc | 76 +++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index 79b53af45..653695caf 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -453,17 +453,12 @@ void Version::Get(const ReadOptions& options, // levels. Therefore we are guaranteed that if we find data // in an smaller level, later levels are irrelevant (unless we // are MergeInProgress). - std::vector important_files; for (int level = 0; level < vset_->NumberLevels(); level++) { size_t num_files = files_[level].size(); if (num_files == 0) continue; // Get the list of files to search in this level FileMetaData* const* files = &files_[level][0]; - important_files.clear(); - if (level == 0) { - important_files.reserve(num_files); - } // Some files may overlap each other. We find // all files that overlap user_key and process them in order from @@ -481,44 +476,42 @@ void Version::Get(const ReadOptions& options, start_index = FindFile(vset_->icmp_, files_[level], ikey); } - // Traverse the list, finding all overlapping files. - for (uint32_t i = start_index; i < num_files; i++) { + // Traverse each relevant file to find the desired key +#ifndef NDEBUG + FileMetaData* prev_file = nullptr; +#endif + for (uint32_t i = start_index; i < num_files; ++i) { FileMetaData* f = files[i]; - if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 && - ucmp->Compare(user_key, f->largest.user_key()) <= 0) { - important_files.push_back(f); - } else if (level > 0) { - // If on Level-n (n>=1) then the files are sorted. - // So we can stop looking when we are past the ikey. - break; - } - } - - if (important_files.empty()) continue; - - if (level == 0) { - if (vset_->options_->compaction_style == kCompactionStyleUniversal) { - std::sort(important_files.begin(), important_files.end(), NewestFirstBySeqNo); - } else { - std::sort(important_files.begin(), important_files.end(), NewestFirst); + if (ucmp->Compare(user_key, f->smallest.user_key()) < 0 || + ucmp->Compare(user_key, f->largest.user_key()) > 0) { + // Only process overlapping files. + if (level > 0) { + // If on Level-n (n>=1) then the files are sorted. 
+ // So we can stop looking when we are past the ikey. + break; + } + // TODO: do we want to check file ranges for level0 files at all? + // For new SST format where Get() is fast, we might want to consider + // to avoid those two comparisons, if it can filter out too few files. + continue; } - } else { - // Sanity check to make sure that the files are correctly sorted #ifndef NDEBUG - num_files = important_files.size(); - for (uint32_t i = 1; i < num_files; ++i) { - FileMetaData* a = important_files[i-1]; - FileMetaData* b = important_files[i]; - int comp_sign = vset_->icmp_.Compare(a->largest, b->smallest); - assert(comp_sign < 0); + // Sanity check to make sure that the files are correctly sorted + if (prev_file) { + if (level != 0) { + int comp_sign = vset_->icmp_.Compare(prev_file->largest, f->smallest); + assert(comp_sign < 0); + } else { + // level == 0, the current file cannot be newer than the previous one. + if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + assert(!NewestFirstBySeqNo(f, prev_file)); + } else { + assert(!NewestFirst(f, prev_file)); + } + } } + prev_file = f; #endif - } - - // Traverse each relevant file to find the desired key - num_files = important_files.size(); - for (uint32_t i = 0; i < num_files; ++i) { - FileMetaData* f = important_files[i]; bool tableIO = false; *status = vset_->table_cache_->Get(options, f->number, f->file_size, ikey, &saver, SaveValue, &tableIO, @@ -1117,6 +1110,13 @@ class VersionSet::Builder { MaybeAddFile(v, level, *base_iter); } } + // Pre-sort level0 for Get() + if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirstBySeqNo); + } else { + std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirst); + } + CheckConsistency(v); } From f37a59769b64d5891ca22a8d5e317d2f2a0aefc8 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 11 Dec 2013 13:52:54 -0800 Subject: [PATCH 27/70] Updating regression test based on master's version --- build_tools/regression_build_test.sh | 116 +++++++++++++++++++++++---- 1 file changed, 99 insertions(+), 17 deletions(-) diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh index b0140ef48..b0c130e3c 100755 --- a/build_tools/regression_build_test.sh +++ b/build_tools/regression_build_test.sh @@ -26,15 +26,20 @@ function cleanup { } trap cleanup EXIT -git_br=$(basename $GIT_BRANCH) + +if [ -z $GIT_BRANCH ]; then + git_br=`git rev-parse --abbrev-ref HEAD` +else + git_br=$(basename $GIT_BRANCH) +fi + if [ $git_br == "master" ]; then git_br="" else git_br="."$git_br fi -make clean -OPT=-DNDEBUG make db_bench -j$(nproc) +make release # measure fillseq + fill up the DB for overwrite benchmark ./db_bench \ @@ -45,7 +50,8 @@ OPT=-DNDEBUG make db_bench -j$(nproc) --num=$NUM \ --writes=$NUM \ --cache_size=6442450944 \ - --cache_numshardbits=6 \ + --cache_numshardbits=4 \ + --table_cache_numshardbits=4 \ --open_files=55000 \ --statistics=1 \ --histogram=1 \ @@ -60,9 +66,10 @@ OPT=-DNDEBUG make db_bench -j$(nproc) --use_existing_db=1 \ --bloom_bits=10 \ --num=$NUM \ - --writes=$((NUM / 2)) \ + --writes=$((NUM / 10)) \ --cache_size=6442450944 \ - --cache_numshardbits=6 \ + --cache_numshardbits=4 \ + --table_cache_numshardbits=4 \ --open_files=55000 \ --statistics=1 \ --histogram=1 \ @@ -80,7 +87,8 @@ OPT=-DNDEBUG make db_bench -j$(nproc) --num=$NUM \ --writes=$NUM \ --cache_size=6442450944 \ - --cache_numshardbits=6 \ + --cache_numshardbits=4 \ + --table_cache_numshardbits=4 \ 
--open_files=55000 \ --statistics=1 \ --histogram=1 \ @@ -96,9 +104,10 @@ OPT=-DNDEBUG make db_bench -j$(nproc) --use_existing_db=1 \ --bloom_bits=10 \ --num=$NUM \ - --reads=$NUM \ + --reads=$((NUM / 5)) \ --cache_size=6442450944 \ - --cache_numshardbits=8 \ + --cache_numshardbits=4 \ + --table_cache_numshardbits=4 \ --open_files=55000 \ --disable_seek_compaction=1 \ --statistics=1 \ @@ -106,26 +115,90 @@ OPT=-DNDEBUG make db_bench -j$(nproc) --disable_data_sync=1 \ --disable_wal=1 \ --sync=0 \ - --threads=32 > ${STAT_FILE}.readrandom + --threads=16 > ${STAT_FILE}.readrandom -# measure readrandom with 300MB block cache +# measure readrandom with 100MB block cache ./db_bench \ --benchmarks=readrandom \ --db=$DATA_DIR \ --use_existing_db=1 \ --bloom_bits=10 \ --num=$NUM \ - --reads=$NUM \ - --cache_size=314572800 \ - --cache_numshardbits=8 \ + --reads=$((NUM / 5)) \ + --cache_size=104857600 \ + --cache_numshardbits=4 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readrandomsmallblockcache + +# measure readrandom with 8k data in memtable +./db_bench \ + --benchmarks=overwrite,readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$NUM \ + --reads=$((NUM / 5)) \ + --writes=512 \ + --cache_size=6442450944 \ + --cache_numshardbits=4 \ + --table_cache_numshardbits=4 \ + --write_buffer_size=1000000000 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readrandom_mem_sst + + +# fill up the db for readrandom benchmark with filluniquerandom (1GB total size) +./db_bench \ + --benchmarks=filluniquerandom \ + --db=$DATA_DIR \ + --use_existing_db=0 \ + --bloom_bits=10 \ + --num=$((NUM / 4)) \ + --writes=$((NUM / 4)) \ + --cache_size=6442450944 \ + --cache_numshardbits=4 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=1 > /dev/null + +# measure readrandom after load with filluniquerandom with 6GB block cache +./db_bench \ + --benchmarks=readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$((NUM / 4)) \ + --reads=$((NUM / 4)) \ + --cache_size=6442450944 \ + --cache_numshardbits=4 \ + --table_cache_numshardbits=4 \ --open_files=55000 \ --disable_seek_compaction=1 \ + --disable_auto_compactions=1 \ --statistics=1 \ --histogram=1 \ --disable_data_sync=1 \ --disable_wal=1 \ --sync=0 \ - --threads=32 > ${STAT_FILE}.readrandomsmallblockcache + --threads=16 > ${STAT_FILE}.readrandom_filluniquerandom # measure memtable performance -- none of the data gets flushed to disk ./db_bench \ @@ -135,7 +208,8 @@ OPT=-DNDEBUG make db_bench -j$(nproc) --num=$((NUM / 10)) \ --reads=$NUM \ --cache_size=6442450944 \ - --cache_numshardbits=8 \ + --cache_numshardbits=4 \ + --table_cache_numshardbits=4 \ --write_buffer_size=1000000000 \ --open_files=55000 \ --disable_seek_compaction=1 \ @@ -145,13 +219,19 @@ OPT=-DNDEBUG make db_bench -j$(nproc) --disable_wal=1 \ --sync=0 \ --value_size=10 \ - --threads=32 > ${STAT_FILE}.memtablefillreadrandom + --threads=16 > ${STAT_FILE}.memtablefillreadrandom # send data to ods function send_to_ods { key="$1" value="$2" + if [ -z $JENKINS_HOME ]; then + # running on devbox, just print out the values + echo $1 $2 + return + 
fi + if [ -z "$value" ]; then echo >&2 "ERROR: Key $key doesn't have a value." return @@ -180,5 +260,7 @@ send_benchmark_to_ods overwrite overwrite $STAT_FILE.overwrite send_benchmark_to_ods fillseq fillseq $STAT_FILE.fillseq send_benchmark_to_ods readrandom readrandom $STAT_FILE.readrandom send_benchmark_to_ods readrandom readrandom_smallblockcache $STAT_FILE.readrandomsmallblockcache +send_benchmark_to_ods readrandom readrandom_memtable_sst $STAT_FILE.readrandom_mem_sst +send_benchmark_to_ods readrandom readrandom_fillunique_random $STAT_FILE.readrandom_filluniquerandom send_benchmark_to_ods fillrandom memtablefillrandom $STAT_FILE.memtablefillreadrandom send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadrandom From aaf9c6203c21b9056b3bf4654fee58ad8473240b Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Wed, 11 Dec 2013 11:56:36 -0800 Subject: [PATCH 28/70] [RocksDB][Performance Branch] Iterator Cleanup method only tries to find obsolete files if it has the last reference to a version Summary: When destroying an iterator, there is no need to check for obsolete files if it doesn't hold the last reference to any version. Test Plan: make all check Reviewers: haobo, igor, dhruba, kailiu Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D14595 --- db/db_impl.cc | 7 ++++--- db/version_set.cc | 4 +++- db/version_set.h | 4 +++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 895bb20cc..192232483 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2515,9 +2515,10 @@ static void CleanupIteratorState(void* arg1, void* arg2) { deletion_state.memtables_to_free.push_back(m); } } - state->version->Unref(); - // fast path FindObsoleteFiles - state->db->FindObsoleteFiles(deletion_state, false, true); + if (state->version->Unref()) { + // fast path FindObsoleteFiles + state->db->FindObsoleteFiles(deletion_state, false, true); + } state->mu->Unlock(); state->db->PurgeObsoleteFiles(deletion_state); delete state; diff --git a/db/version_set.cc b/db/version_set.cc index 653695caf..74a33b6fb 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -591,13 +591,15 @@ void Version::Ref() { ++refs_; } -void Version::Unref() { +bool Version::Unref() { assert(this != &vset_->dummy_versions_); assert(refs_ >= 1); --refs_; if (refs_ == 0) { delete this; + return true; } + return false; } bool Version::OverlapInLevel(int level, diff --git a/db/version_set.h b/db/version_set.h index bf466a932..aab4b82bc 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -89,7 +89,9 @@ class Version { // Reference count management (so Versions do not disappear out from // under live iterators) void Ref(); - void Unref(); + // Decrease the reference count. If no references are left, delete the + // object and return true; otherwise return false. + bool Unref(); void GetOverlappingInputs( int level, From e8ab1934d9cb3ffebd61097d67bb23439554b265 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 12 Dec 2013 10:54:03 -0800 Subject: [PATCH 29/70] [RocksDB Performance Branch] DBImpl.NewInternalIterator() to reduce work inside mutex Summary: To reduce mutex contention caused by DBImpl.NewInternalIterator(), move all the iterator creation work in this function out of the mutex, leaving only object referencing and fetching inside. Test Plan: make all check. Will also run db_stress for a while to make sure there are no problems.
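The essence of the change is shrinking the critical section: only cheap reference-count bumps stay under the mutex, while the actual iterator construction happens after the unlock. A minimal standalone sketch of the pattern (hypothetical names, not the actual DBImpl code):

  #include <mutex>
  #include <vector>

  struct Resource {
    int refs = 1;
    void Ref() { ++refs; }  // called only while holding the mutex
  };

  std::mutex mu;                  // guards `live`
  std::vector<Resource*> live;

  std::vector<Resource*> PinResources() {
    std::vector<Resource*> pinned;
    {
      std::lock_guard<std::mutex> l(mu);
      pinned = live;                         // cheap: copy raw pointers
      for (Resource* r : pinned) r->Ref();   // cheap: bump refcounts
    }
    // Expensive work (e.g. allocating child iterators over the pinned
    // resources) now runs outside the lock.
    return pinned;
  }

The pinned objects stay alive after the unlock because their reference counts were bumped first, which is exactly what the Ref()/Unref() pairing in the diff below relies on.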
Reviewers: haobo, dhruba, kailiu Reviewed By: haobo CC: igor, leveldb Differential Revision: https://reviews.facebook.net/D14589 --- db/db_impl.cc | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 192232483..361a8c37a 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2528,39 +2528,40 @@ static void CleanupIteratorState(void* arg1, void* arg2) { Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, SequenceNumber* latest_snapshot) { IterState* cleanup = new IterState; - mutex_.Lock(); - *latest_snapshot = versions_->LastSequence(); + MemTable* mutable_mem; + std::vector immutables; + Version* version; // Collect together all needed child iterators for mem - std::vector list; + mutex_.Lock(); + *latest_snapshot = versions_->LastSequence(); mem_->Ref(); - list.push_back(mem_->NewIterator(options)); - - cleanup->mem.push_back(mem_); - + mutable_mem = mem_; // Collect together all needed child iterators for imm_ - std::vector immutables; imm_.GetMemTables(&immutables); for (unsigned int i = 0; i < immutables.size(); i++) { - MemTable* m = immutables[i]; - m->Ref(); + immutables[i]->Ref(); + } + // Collect iterators for files in L0 - Ln + versions_->current()->Ref(); + version = versions_->current(); + mutex_.Unlock(); + + std::vector list; + list.push_back(mutable_mem->NewIterator(options)); + cleanup->mem.push_back(mutable_mem); + for (MemTable* m : immutables) { list.push_back(m->NewIterator(options)); cleanup->mem.push_back(m); } - - // Collect iterators for files in L0 - Ln - versions_->current()->AddIterators(options, storage_options_, &list); + version->AddIterators(options, storage_options_, &list); Iterator* internal_iter = NewMergingIterator(env_, &internal_comparator_, &list[0], list.size()); - versions_->current()->Ref(); - + cleanup->version = version; cleanup->mu = &mutex_; cleanup->db = this; - cleanup->version = versions_->current(); internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); - mutex_.Unlock(); - LogFlush(options_.info_log); return internal_iter; } From bc9b488e922647bb4d7056ccf846a2b61df5dbb6 Mon Sep 17 00:00:00 2001 From: kailiu Date: Thu, 12 Dec 2013 15:34:24 -0800 Subject: [PATCH 30/70] fix a warning in db_test when running `make release` --- db/db_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/db/db_test.cc b/db/db_test.cc index 0e94981d2..a8d71e2f8 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2596,6 +2596,7 @@ class ChangeFilter : public CompactionFilter { const Slice& value, std::string* new_value, bool* value_changed) const override { assert(argv_ == 100); + argv_ = argv_ + 0; assert(new_value != nullptr); *new_value = NEW_VALUE; *value_changed = true; From 0e24f97b9f75cd20622e5269a4b95e2347188119 Mon Sep 17 00:00:00 2001 From: kailiu Date: Thu, 12 Dec 2013 15:40:44 -0800 Subject: [PATCH 31/70] Revert last commit and add "unused" attribute to suppress warning --- db/db_test.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index a8d71e2f8..918126bbe 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2596,7 +2596,6 @@ class ChangeFilter : public CompactionFilter { const Slice& value, std::string* new_value, bool* value_changed) const override { assert(argv_ == 100); - argv_ = argv_ + 0; assert(new_value != nullptr); *new_value = NEW_VALUE; *value_changed = true; @@ -2608,7 +2607,7 @@ class ChangeFilter : public CompactionFilter { } private: - const int argv_; + const int 
__attribute__((unused)) argv_; }; class KeepFilterFactory : public CompactionFilterFactory { From 0cd1521af5b79257820a1c7c1d131ba3393c9e12 Mon Sep 17 00:00:00 2001 From: kailiu Date: Thu, 12 Dec 2013 16:36:38 -0800 Subject: [PATCH 32/70] Completely remove argv_ since no one uses it There are still warnings in some other environments, so just remove the unused variable `argv_` entirely. --- db/db_test.cc | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 918126bbe..27e90f110 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2590,12 +2590,11 @@ class DeleteFilter : public CompactionFilter { class ChangeFilter : public CompactionFilter { public: - explicit ChangeFilter(int argv) : argv_(argv) {} + explicit ChangeFilter() {} virtual bool Filter(int level, const Slice& key, const Slice& value, std::string* new_value, bool* value_changed) const override { - assert(argv_ == 100); assert(new_value != nullptr); *new_value = NEW_VALUE; *value_changed = true; @@ -2605,9 +2604,6 @@ class ChangeFilter : public CompactionFilter { virtual const char* Name() const override { return "ChangeFilter"; } - - private: - const int __attribute__((unused)) argv_; }; class KeepFilterFactory : public CompactionFilterFactory { @@ -2636,19 +2632,16 @@ class DeleteFilterFactory : public CompactionFilterFactory { class ChangeFilterFactory : public CompactionFilterFactory { public: - explicit ChangeFilterFactory(int argv) : argv_(argv) {} + explicit ChangeFilterFactory() {} virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(const CompactionFilter::Context& context) override { - return std::unique_ptr<CompactionFilter>(new ChangeFilter(argv_)); + return std::unique_ptr<CompactionFilter>(new ChangeFilter()); } virtual const char* Name() const override { return "ChangeFilterFactory"; } - - private: - const int argv_; }; TEST(DBTest, CompactionFilter) { @@ -2795,7 +2788,7 @@ TEST(DBTest, CompactionFilterWithValueChange) { options.num_levels = 3; options.max_mem_compaction_level = 0; options.compaction_filter_factory = - std::make_shared<ChangeFilterFactory>(100); + std::make_shared<ChangeFilterFactory>(); Reopen(&options); // Write 100K+1 keys, these are written to a few files From 9718c790ec286fe2dad70dea491b54c34e5547a7 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 12 Dec 2013 18:42:07 -0800 Subject: [PATCH 33/70] [Performance Branch] Fix a bug in PlainTable when building indexes Summary: PlainTable has a bug in the ordering of index entries for prefixes in the same bucket. I thought std::map guaranteed key order, but it didn't, probably because I didn't use it properly. It also seems we don't need any extra sorting, since the input prefixes are already sorted. Found the problem by running leaf4 against plain table. Replace the map with a vector; it should perform better too. After the fix, leaf4 unit tests are passing. Test Plan: run plain_table_db_test. Also going to run db_test with plain table in the uncommitted branch.
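To make the sorting invariant concrete: the builder receives keys in sorted order, so all keys sharing a prefix arrive adjacent to each other, and the prefixes themselves arrive in ascending order. Appending one (prefix, sub-index) pair per prefix group therefore yields an already-sorted vector with no extra sort and no std::map overhead. A simplified sketch (hypothetical helper, not the patch's code):

  #include <cstddef>
  #include <string>
  #include <utility>
  #include <vector>

  std::vector<std::pair<std::string, std::string>> BuildPrefixIndex(
      const std::vector<std::string>& sorted_keys, std::size_t prefix_len) {
    std::vector<std::pair<std::string, std::string>> pairs;
    for (const std::string& key : sorted_keys) {
      std::string prefix = key.substr(0, prefix_len);
      if (pairs.empty() || pairs.back().first != prefix) {
        pairs.emplace_back(prefix, std::string());  // a new prefix group starts
      }
      pairs.back().second += key;  // stand-in for accumulating index entries
    }
    return pairs;  // sorted by prefix, by construction
  }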
Reviewers: haobo, kailiu Reviewed By: haobo CC: nkg-, leveldb Differential Revision: https://reviews.facebook.net/D14649 --- table/plain_table_reader.cc | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index cccaf61c9..e808948ab 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -37,14 +37,6 @@ public: return MurmurHash(s.data(), s.size(), 397); } }; - -class slice_comparator { -public: - bool operator()(rocksdb::Slice const& s1, - rocksdb::Slice const& s2) const { - return s1.compare(s2) < 0; - } -}; } namespace rocksdb { @@ -146,8 +138,8 @@ Status PlainTableReader::PopulateIndex(uint64_t file_size) { HistogramImpl keys_per_prefix_hist; // Need map to be ordered to make sure sub indexes generated // are in order. - std::map prefix2map; - + std::vector> prefix_index_pairs; + std::string current_prefix_index; while (pos < file_size) { uint32_t key_offset = pos; status_ = Next(pos, &key_slice, &value_slice, pos); @@ -156,6 +148,11 @@ Status PlainTableReader::PopulateIndex(uint64_t file_size) { if (first || prev_key_prefix_slice != key_prefix_slice) { if (!first) { keys_per_prefix_hist.Add(key_index_within_prefix); + prefix_index_pairs.push_back( + std::make_pair( + std::move(prev_key_prefix_slice), + std::move(current_prefix_index))); + current_prefix_index.clear(); } key_index_within_prefix = 0; prev_key_prefix_slice = key_prefix_slice; @@ -163,27 +160,30 @@ Status PlainTableReader::PopulateIndex(uint64_t file_size) { if (key_index_within_prefix++ % 8 == 0) { // Add an index key for every 8 keys - std::string& prefix_index = prefix2map[key_prefix_slice]; - PutFixed32(&prefix_index, key_offset); + PutFixed32(¤t_prefix_index, key_offset); } first = false; } + prefix_index_pairs.push_back( + std::make_pair(std::move(prev_key_prefix_slice), + std::move(current_prefix_index))); + keys_per_prefix_hist.Add(key_index_within_prefix); if (hash_table_ != nullptr) { delete[] hash_table_; } std::vector filter_entries(0); // for creating bloom filter; if (filter_policy_ != nullptr) { - filter_entries.reserve(prefix2map.size()); + filter_entries.reserve(prefix_index_pairs.size()); } double hash_table_size_multipier = (hash_table_ratio_ > 1.0) ? 1.0 : 1.0 / hash_table_ratio_; - hash_table_size_ = prefix2map.size() * hash_table_size_multipier + 1; + hash_table_size_ = prefix_index_pairs.size() * hash_table_size_multipier + 1; hash_table_ = new uint32_t[hash_table_size_]; std::vector hash2map(hash_table_size_); size_t sub_index_size_needed = 0; - for (auto& p: prefix2map) { + for (auto& p: prefix_index_pairs) { auto& sub_index = hash2map[getBucketId(p.first, key_prefix_len_, hash_table_size_)]; if (sub_index.length() > 0 || p.second.length() > kOffsetLen) { From b660e2d468888e28f8c2badd2fa8639c0c7eeefe Mon Sep 17 00:00:00 2001 From: kailiu Date: Tue, 10 Dec 2013 16:21:49 -0800 Subject: [PATCH 34/70] Expose usage info for the cache Summary: This diff will help us to figure out the memory usage for the cache part. 
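The new accessor pairs naturally with the existing GetCapacity(). A short usage sketch, assuming only the public Cache API shown in this diff:

  #include <cstdio>
  #include <memory>
  #include "rocksdb/cache.h"

  void ReportCacheUsage(const std::shared_ptr<rocksdb::Cache>& cache) {
    // GetUsage() sums the per-shard charges of resident entries;
    // GetCapacity() is the configured maximum, so together they give
    // the cache occupancy.
    std::printf("cache usage: %zu / %zu bytes\n",
                cache->GetUsage(), cache->GetCapacity());
  }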
Test Plan: added a new memory usage test for cache Reviewers: haobo, sdong, dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D14559 --- include/rocksdb/cache.h | 3 ++ util/cache.cc | 65 ++++++++++++++++++++++++----------------- util/cache_test.cc | 34 ++++++++++++++++++++- 3 files changed, 74 insertions(+), 28 deletions(-) diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 201d82f19..c8542f072 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -104,6 +104,9 @@ class Cache { // returns the maximum configured capacity of the cache virtual size_t GetCapacity() const = 0; + // returns the memory size for the entries residing in the cache. + virtual size_t GetUsage() const = 0; + private: void LRU_Remove(Handle* e); void LRU_Append(Handle* e); diff --git a/util/cache.cc b/util/cache.cc index b9d41be49..8fceefc82 100644 --- a/util/cache.cc +++ b/util/cache.cc @@ -156,6 +156,7 @@ class LRUCache { Cache::Handle* Lookup(const Slice& key, uint32_t hash); void Release(Cache::Handle* handle); void Erase(const Slice& key, uint32_t hash); + size_t GetUsage() const { return usage_.load(std::memory_order_relaxed); } private: void LRU_Remove(LRUHandle* e); @@ -172,7 +173,7 @@ class LRUCache { // mutex_ protects the following state. port::Mutex mutex_; - size_t usage_; + std::atomic_size_t usage_; // Dummy head of LRU list. // lru.prev is newest entry, lru.next is oldest entry. @@ -214,7 +215,7 @@ void LRUCache::FreeEntry(LRUHandle* e) { void LRUCache::LRU_Remove(LRUHandle* e) { e->next->prev = e->prev; e->prev->next = e->next; - usage_ -= e->charge; + usage_.fetch_sub(e->charge, std::memory_order_relaxed); } void LRUCache::LRU_Append(LRUHandle* e) { @@ -223,7 +224,7 @@ void LRUCache::LRU_Append(LRUHandle* e) { e->prev = lru_.prev; e->prev->next = e; e->next->prev = e; - usage_ += e->charge; + usage_.fetch_add(e->charge, std::memory_order_relaxed); } Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) { @@ -282,7 +283,7 @@ Cache::Handle* LRUCache::Insert( // referenced by the cache first. LRUHandle* cur = lru_.next; for (unsigned int scanCount = 0; - usage_ > capacity_ && cur != &lru_ + GetUsage() > capacity_ && cur != &lru_ && scanCount < remove_scan_count_limit_; scanCount++) { LRUHandle* next = cur->next; if (cur->refs <= 1) { @@ -298,7 +299,7 @@ Cache::Handle* LRUCache::Insert( // Free the space following strict LRU policy until enough space // is freed. - while (usage_ > capacity_ && lru_.next != &lru_) { + while (GetUsage() > capacity_ && lru_.next != &lru_) { LRUHandle* old = lru_.next; LRU_Remove(old); table_.Remove(old->key(), old->hash); @@ -340,10 +341,10 @@ static int kRemoveScanCountLimit = 0; // default values, can be overridden class ShardedLRUCache : public Cache { private: - LRUCache* shard_; + LRUCache* shards_; port::Mutex id_mutex_; uint64_t last_id_; - int numShardBits; + int num_shard_bits_; size_t capacity_; static inline uint32_t HashSlice(const Slice& s) { @@ -352,18 +353,18 @@ class ShardedLRUCache : public Cache { uint32_t Shard(uint32_t hash) { // Note, hash >> 32 yields hash in gcc, not the zero we expect! - return (numShardBits > 0) ? (hash >> (32 - numShardBits)) : 0; + return (num_shard_bits_ > 0) ? 
(hash >> (32 - num_shard_bits_)) : 0; } void init(size_t capacity, int numbits, int removeScanCountLimit) { - numShardBits = numbits; + num_shard_bits_ = numbits; capacity_ = capacity; - int numShards = 1 << numShardBits; - shard_ = new LRUCache[numShards]; - const size_t per_shard = (capacity + (numShards - 1)) / numShards; - for (int s = 0; s < numShards; s++) { - shard_[s].SetCapacity(per_shard); - shard_[s].SetRemoveScanCountLimit(removeScanCountLimit); + int num_shards = 1 << num_shard_bits_; + shards_ = new LRUCache[num_shards]; + const size_t per_shard = (capacity + (num_shards - 1)) / num_shards; + for (int s = 0; s < num_shards; s++) { + shards_[s].SetCapacity(per_shard); + shards_[s].SetRemoveScanCountLimit(removeScanCountLimit); } } @@ -372,30 +373,30 @@ class ShardedLRUCache : public Cache { : last_id_(0) { init(capacity, kNumShardBits, kRemoveScanCountLimit); } - ShardedLRUCache(size_t capacity, int numShardBits, + ShardedLRUCache(size_t capacity, int num_shard_bits, int removeScanCountLimit) : last_id_(0) { - init(capacity, numShardBits, removeScanCountLimit); + init(capacity, num_shard_bits, removeScanCountLimit); } virtual ~ShardedLRUCache() { - delete[] shard_; + delete[] shards_; } virtual Handle* Insert(const Slice& key, void* value, size_t charge, void (*deleter)(const Slice& key, void* value)) { const uint32_t hash = HashSlice(key); - return shard_[Shard(hash)].Insert(key, hash, value, charge, deleter); + return shards_[Shard(hash)].Insert(key, hash, value, charge, deleter); } virtual Handle* Lookup(const Slice& key) { const uint32_t hash = HashSlice(key); - return shard_[Shard(hash)].Lookup(key, hash); + return shards_[Shard(hash)].Lookup(key, hash); } virtual void Release(Handle* handle) { LRUHandle* h = reinterpret_cast(handle); - shard_[Shard(h->hash)].Release(handle); + shards_[Shard(h->hash)].Release(handle); } virtual void Erase(const Slice& key) { const uint32_t hash = HashSlice(key); - shard_[Shard(hash)].Erase(key, hash); + shards_[Shard(hash)].Erase(key, hash); } virtual void* Value(Handle* handle) { return reinterpret_cast(handle)->value; @@ -407,6 +408,16 @@ class ShardedLRUCache : public Cache { virtual size_t GetCapacity() const { return capacity_; } + virtual size_t GetUsage() const { + // We will not lock the cache when getting the usage from shards. 
+ int num_shards = 1 << num_shard_bits_; + size_t usage = 0; + for (int s = 0; s < num_shards; s++) { + usage += shards_[s].GetUsage(); + } + return usage; + } }; } // end anonymous namespace @@ -415,17 +426,17 @@ shared_ptr<Cache> NewLRUCache(size_t capacity) { return NewLRUCache(capacity, kNumShardBits); } -shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits) { - return NewLRUCache(capacity, numShardBits, kRemoveScanCountLimit); +shared_ptr<Cache> NewLRUCache(size_t capacity, int num_shard_bits) { + return NewLRUCache(capacity, num_shard_bits, kRemoveScanCountLimit); } -shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits, +shared_ptr<Cache> NewLRUCache(size_t capacity, int num_shard_bits, int removeScanCountLimit) { - if (numShardBits >= 20) { + if (num_shard_bits >= 20) { return nullptr; // the cache cannot be sharded into too many fine pieces } return std::make_shared<ShardedLRUCache>(capacity, - numShardBits, + num_shard_bits, removeScanCountLimit); } diff --git a/util/cache_test.cc b/util/cache_test.cc index 87ab91389..2e10bdc3a 100644 --- a/util/cache_test.cc +++ b/util/cache_test.cc @@ -107,6 +107,39 @@ class CacheTest { }; CacheTest* CacheTest::current_; +void dumbDeleter(const Slice& key, void* value) { } + +TEST(CacheTest, UsageTest) { + // cache is a shared_ptr and will be cleaned up automatically. + const uint64_t kCapacity = 100000; + auto cache = NewLRUCache(kCapacity, 8, 200); + + size_t usage = 0; + const char* value = "abcdef"; + // make sure everything will be cached + for (int i = 1; i < 100; ++i) { + std::string key(i, 'a'); + auto kv_size = key.size() + 5; + cache->Release( + cache->Insert(key, (void*)value, kv_size, dumbDeleter) + ); + usage += kv_size; + ASSERT_EQ(usage, cache->GetUsage()); + } + + // make sure the cache will be overloaded + for (int i = 1; i < kCapacity; ++i) { + auto key = std::to_string(i); + cache->Release( + cache->Insert(key, (void*)value, key.size() + 5, dumbDeleter) + ); + } + + // the usage should be close to the capacity + ASSERT_GT(kCapacity, cache->GetUsage()); + ASSERT_LT(kCapacity * 0.95, cache->GetUsage()); +} + TEST(CacheTest, HitAndMiss) { ASSERT_EQ(-1, Lookup(100)); From 5090316f0ded4bf9af04d78cbacb031824d31c3f Mon Sep 17 00:00:00 2001 From: Haobo Xu Date: Fri, 13 Dec 2013 14:19:18 -0800 Subject: [PATCH 35/70] [RocksDB] [Performance Branch] Trivial build fix Summary: make release complains about a signed/unsigned comparison. Test Plan: make release Reviewers: kailiu CC: leveldb Differential Revision: https://reviews.facebook.net/D14661 --- util/cache_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/cache_test.cc b/util/cache_test.cc index 2e10bdc3a..b99f47b38 100644 --- a/util/cache_test.cc +++ b/util/cache_test.cc @@ -128,7 +128,7 @@ TEST(CacheTest, UsageTest) { } // make sure the cache will be overloaded - for (int i = 1; i < kCapacity; ++i) { + for (uint64_t i = 1; i < kCapacity; ++i) { auto key = std::to_string(i); cache->Release( cache->Insert(key, (void*)value, key.size() + 5, dumbDeleter) From 5f5e5fc2e95fdb048883928c05b127c4617ef290 Mon Sep 17 00:00:00 2001 From: kailiu Date: Fri, 13 Dec 2013 15:43:05 -0800 Subject: [PATCH 36/70] Revert `atomic_size_t usage` Summary: By disassembling the function, we found that the atomic variables do invoke the `lock` instruction that locks the memory bus.
As a tradeoff, we protect the GetUsage by mutex and leave usage_ as plain size_t. Test Plan: passed `cache_test` Reviewers: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D14667 --- util/cache.cc | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/util/cache.cc b/util/cache.cc index 8fceefc82..34a12d345 100644 --- a/util/cache.cc +++ b/util/cache.cc @@ -156,7 +156,13 @@ class LRUCache { Cache::Handle* Lookup(const Slice& key, uint32_t hash); void Release(Cache::Handle* handle); void Erase(const Slice& key, uint32_t hash); - size_t GetUsage() const { return usage_.load(std::memory_order_relaxed); } + // Although in some platforms the update of size_t is atomic, to make sure + // GetUsage() works correctly under any platforms, we'll protect this + // function with mutex. + size_t GetUsage() const { + MutexLock l(&mutex_); + return usage_; + } private: void LRU_Remove(LRUHandle* e); @@ -172,8 +178,10 @@ class LRUCache { uint32_t remove_scan_count_limit_; // mutex_ protects the following state. - port::Mutex mutex_; - std::atomic_size_t usage_; + // We don't count mutex_ as the cache's internal state so semantically we + // don't mind mutex_ invoking the non-const actions. + mutable port::Mutex mutex_; + size_t usage_; // Dummy head of LRU list. // lru.prev is newest entry, lru.next is oldest entry. @@ -215,7 +223,7 @@ void LRUCache::FreeEntry(LRUHandle* e) { void LRUCache::LRU_Remove(LRUHandle* e) { e->next->prev = e->prev; e->prev->next = e->next; - usage_.fetch_sub(e->charge, std::memory_order_relaxed); + usage_ -= e->charge; } void LRUCache::LRU_Append(LRUHandle* e) { @@ -224,7 +232,7 @@ void LRUCache::LRU_Append(LRUHandle* e) { e->prev = lru_.prev; e->prev->next = e; e->next->prev = e; - usage_.fetch_add(e->charge, std::memory_order_relaxed); + usage_ += e->charge; } Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) { @@ -283,7 +291,7 @@ Cache::Handle* LRUCache::Insert( // referenced by the cache first. LRUHandle* cur = lru_.next; for (unsigned int scanCount = 0; - GetUsage() > capacity_ && cur != &lru_ + usage_ > capacity_ && cur != &lru_ && scanCount < remove_scan_count_limit_; scanCount++) { LRUHandle* next = cur->next; if (cur->refs <= 1) { @@ -299,7 +307,7 @@ Cache::Handle* LRUCache::Insert( // Free the space following strict LRU policy until enough space // is freed. - while (GetUsage() > capacity_ && lru_.next != &lru_) { + while (usage_ > capacity_ && lru_.next != &lru_) { LRUHandle* old = lru_.next; LRU_Remove(old); table_.Remove(old->key(), old->hash); From 2e9efcd6d8d456b2800a8ae079c4df0ff05f090c Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Thu, 5 Dec 2013 16:51:26 -0800 Subject: [PATCH 37/70] Add the property block for the plain table Summary: This is the last diff that adds the property block to plain table. The format resembles that of the block-based table: https://github.com/facebook/rocksdb/wiki/Rocksdb-table-format [data block] [meta block 1: stats block] [meta block 2: future extended block] ... [meta block K: future extended block] (we may add more meta blocks in the future) [metaindex block] [index block: we only have the placeholder here, we can add persistent index block in the future] [Footer: contains magic number, handle to metaindex block and index block] Test Plan: extended existing property block test. 
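Since the reader-side entry point is easy to miss in the diffs below, here is a sketch of how the new ReadTableProperties() helper is meant to be called, mirroring its use in table_test.cc (error handling trimmed):

  #include "rocksdb/env.h"
  #include "table/meta_blocks.h"

  namespace rocksdb {
  // Defined in plain_table_builder.cc as part of this patch.
  extern const uint64_t kPlainTableMagicNumber;

  Status ReadPlainTableProps(RandomAccessFile* file, uint64_t file_size,
                             TableProperties* props) {
    // Internally walks footer -> metaindex -> "rocksdb.properties" block.
    return ReadTableProperties(file, file_size, kPlainTableMagicNumber,
                               Env::Default(), nullptr /* info_log */, props);
  }
  }  // namespace rocksdb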
Reviewers: haobo, sdong, dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D14523 --- db/table_properties_collector_test.cc | 258 +++++++++++++++----------- include/rocksdb/table_properties.h | 1 + table/block_based_table_builder.cc | 4 +- table/block_based_table_reader.cc | 101 +--------- table/block_based_table_reader.h | 6 - table/meta_blocks.cc | 144 ++++++++++++++ table/meta_blocks.h | 26 ++- table/plain_table_builder.cc | 111 ++++++++++- table/plain_table_builder.h | 10 +- table/plain_table_reader.cc | 64 ++++--- table/plain_table_reader.h | 23 ++- table/table_properties.cc | 2 + table/table_test.cc | 133 ++++++++----- 13 files changed, 577 insertions(+), 306 deletions(-) diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index bbac4aa64..b7ff97b34 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -12,7 +12,9 @@ #include "db/table_properties_collector.h" #include "rocksdb/table_properties.h" #include "rocksdb/table.h" +#include "rocksdb/plain_table_factory.h" #include "table/block_based_table_factory.h" +#include "table/meta_blocks.h" #include "util/coding.h" #include "util/testharness.h" #include "util/testutil.h" @@ -20,8 +22,6 @@ namespace rocksdb { class TablePropertiesTest { - private: - unique_ptr table_reader_; }; // TODO(kailiu) the following classes should be moved to some more general @@ -93,22 +93,6 @@ void MakeBuilder( options.compression)); } -void OpenTable( - const Options& options, - const std::string& contents, - std::unique_ptr* table_reader) { - - std::unique_ptr file(new FakeRandomeAccessFile(contents)); - auto s = options.table_factory->GetTableReader( - options, - EnvOptions(), - std::move(file), - contents.size(), - table_reader - ); - ASSERT_OK(s); -} - // Collects keys that starts with "A" in a table. class RegularKeysStartWithA: public TablePropertiesCollector { public: @@ -141,23 +125,66 @@ class RegularKeysStartWithA: public TablePropertiesCollector { uint32_t count_ = 0; }; -TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) { - Options options; - +extern uint64_t kBlockBasedTableMagicNumber; +extern uint64_t kPlainTableMagicNumber; +void TestCustomizedTablePropertiesCollector( + uint64_t magic_number, + bool encode_as_internal, + const Options& options) { // make sure the entries will be inserted with order. 
std::map kvs = { - {"About", "val5"}, // starts with 'A' - {"Abstract", "val2"}, // starts with 'A' - {"Around", "val7"}, // starts with 'A' - {"Beyond", "val3"}, - {"Builder", "val1"}, - {"Cancel", "val4"}, - {"Find", "val6"}, + {"About ", "val5"}, // starts with 'A' + {"Abstract", "val2"}, // starts with 'A' + {"Around ", "val7"}, // starts with 'A' + {"Beyond ", "val3"}, + {"Builder ", "val1"}, + {"Cancel ", "val4"}, + {"Find ", "val6"}, }; + // -- Step 1: build table + std::unique_ptr builder; + std::unique_ptr writable; + MakeBuilder(options, &writable, &builder); + + for (const auto& kv : kvs) { + if (encode_as_internal) { + InternalKey ikey(kv.first, 0, ValueType::kTypeValue); + builder->Add(ikey.Encode(), kv.second); + } else { + builder->Add(kv.first, kv.second); + } + } + ASSERT_OK(builder->Finish()); + + // -- Step 2: Read properties + FakeRandomeAccessFile readable(writable->contents()); + TableProperties props; + Status s = ReadTableProperties( + &readable, + writable->contents().size(), + magic_number, + Env::Default(), + nullptr, + &props + ); + ASSERT_OK(s); + + auto user_collected = props.user_collected_properties; + + ASSERT_EQ("Rocksdb", user_collected.at("TablePropertiesTest")); + + uint32_t starts_with_A = 0; + Slice key(user_collected.at("Count")); + ASSERT_TRUE(GetVarint32(&key, &starts_with_A)); + ASSERT_EQ(3u, starts_with_A); +} + +TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) { // Test properties collectors with internal keys or regular keys + // for block based table for (bool encode_as_internal : { true, false }) { - // -- Step 1: build table + Options options; auto collector = new RegularKeysStartWithA(); if (encode_as_internal) { options.table_properties_collectors = { @@ -167,97 +194,114 @@ TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) { options.table_properties_collectors.resize(1); options.table_properties_collectors[0].reset(collector); } - std::unique_ptr builder; - std::unique_ptr writable; - MakeBuilder(options, &writable, &builder); - - for (const auto& kv : kvs) { - if (encode_as_internal) { - InternalKey ikey(kv.first, 0, ValueType::kTypeValue); - builder->Add(ikey.Encode(), kv.second); - } else { - builder->Add(kv.first, kv.second); - } - } - ASSERT_OK(builder->Finish()); - - // -- Step 2: Open table - std::unique_ptr table_reader; - OpenTable(options, writable->contents(), &table_reader); - const auto& properties = - table_reader->GetTableProperties().user_collected_properties; - - ASSERT_EQ("Rocksdb", properties.at("TablePropertiesTest")); - - uint32_t starts_with_A = 0; - Slice key(properties.at("Count")); - ASSERT_TRUE(GetVarint32(&key, &starts_with_A)); - ASSERT_EQ(3u, starts_with_A); + TestCustomizedTablePropertiesCollector( + kBlockBasedTableMagicNumber, + encode_as_internal, + options + ); } + + // test plain table + Options options; + options.table_properties_collectors.push_back( + std::make_shared() + ); + options.table_factory = std::make_shared(8, 8, 0); + TestCustomizedTablePropertiesCollector( + kPlainTableMagicNumber, true, options + ); } -TEST(TablePropertiesTest, InternalKeyPropertiesCollector) { +void TestInternalKeyPropertiesCollector( + uint64_t magic_number, + bool sanitized, + std::shared_ptr table_factory) { InternalKey keys[] = { - InternalKey("A", 0, ValueType::kTypeValue), - InternalKey("B", 0, ValueType::kTypeValue), - InternalKey("C", 0, ValueType::kTypeValue), - InternalKey("W", 0, ValueType::kTypeDeletion), - InternalKey("X", 0, ValueType::kTypeDeletion), - InternalKey("Y", 0, 
ValueType::kTypeDeletion), - InternalKey("Z", 0, ValueType::kTypeDeletion), + InternalKey("A ", 0, ValueType::kTypeValue), + InternalKey("B ", 0, ValueType::kTypeValue), + InternalKey("C ", 0, ValueType::kTypeValue), + InternalKey("W ", 0, ValueType::kTypeDeletion), + InternalKey("X ", 0, ValueType::kTypeDeletion), + InternalKey("Y ", 0, ValueType::kTypeDeletion), + InternalKey("Z ", 0, ValueType::kTypeDeletion), }; - for (bool sanitized : { false, true }) { - std::unique_ptr builder; - std::unique_ptr writable; - Options options; - if (sanitized) { - options.table_properties_collectors = { - std::make_shared() - }; - // with sanitization, even regular properties collector will be able to - // handle internal keys. - auto comparator = options.comparator; - // HACK: Set options.info_log to avoid writing log in - // SanitizeOptions(). - options.info_log = std::make_shared(); - options = SanitizeOptions( - "db", // just a place holder - nullptr, // with skip internal key comparator - nullptr, // don't care filter policy - options - ); - options.comparator = comparator; - } else { - options.table_properties_collectors = { - std::make_shared() - }; - } - - MakeBuilder(options, &writable, &builder); - for (const auto& k : keys) { - builder->Add(k.Encode(), "val"); - } + std::unique_ptr builder; + std::unique_ptr writable; + Options options; + options.table_factory = table_factory; + if (sanitized) { + options.table_properties_collectors = { + std::make_shared() + }; + // with sanitization, even regular properties collector will be able to + // handle internal keys. + auto comparator = options.comparator; + // HACK: Set options.info_log to avoid writing log in + // SanitizeOptions(). + options.info_log = std::make_shared(); + options = SanitizeOptions( + "db", // just a place holder + nullptr, // with skip internal key comparator + nullptr, // don't care filter policy + options + ); + options.comparator = comparator; + } else { + options.table_properties_collectors = { + std::make_shared() + }; + } - ASSERT_OK(builder->Finish()); + MakeBuilder(options, &writable, &builder); + for (const auto& k : keys) { + builder->Add(k.Encode(), "val"); + } - std::unique_ptr table_reader; - OpenTable(options, writable->contents(), &table_reader); - const auto& properties = - table_reader->GetTableProperties().user_collected_properties; + ASSERT_OK(builder->Finish()); + + FakeRandomeAccessFile readable(writable->contents()); + TableProperties props; + Status s = ReadTableProperties( + &readable, + writable->contents().size(), + magic_number, + Env::Default(), + nullptr, + &props + ); + ASSERT_OK(s); - uint64_t deleted = GetDeletedKeys(properties); - ASSERT_EQ(4u, deleted); + auto user_collected = props.user_collected_properties; + uint64_t deleted = GetDeletedKeys(user_collected); + ASSERT_EQ(4u, deleted); - if (sanitized) { - uint32_t starts_with_A = 0; - Slice key(properties.at("Count")); - ASSERT_TRUE(GetVarint32(&key, &starts_with_A)); - ASSERT_EQ(1u, starts_with_A); - } + if (sanitized) { + uint32_t starts_with_A = 0; + Slice key(user_collected.at("Count")); + ASSERT_TRUE(GetVarint32(&key, &starts_with_A)); + ASSERT_EQ(1u, starts_with_A); } } +TEST(TablePropertiesTest, InternalKeyPropertiesCollector) { + TestInternalKeyPropertiesCollector( + kBlockBasedTableMagicNumber, + true /* sanitize */, + std::make_shared() + ); + TestInternalKeyPropertiesCollector( + kBlockBasedTableMagicNumber, + true /* not sanitize */, + std::make_shared() + ); + TestInternalKeyPropertiesCollector( + kPlainTableMagicNumber, + 
false /* not sanitize */, + std::make_shared(8, 8, 0) + ); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index 75c8bcc16..c2570acf6 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -64,6 +64,7 @@ struct TablePropertiesNames { static const std::string kFilterPolicy; }; +extern const std::string kPropertiesBlock; // `TablePropertiesCollector` provides the mechanism for users to collect // their own interested properties. This class is essentially a collection diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 61ac193c9..e81d99ede 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -387,7 +387,7 @@ Status BlockBasedTableBuilder::Finish() { &properties_block_handle ); - meta_index_builer.Add(BlockBasedTable::kPropertiesBlock, + meta_index_builer.Add(kPropertiesBlock, properties_block_handle); } // end of properties block writing @@ -459,7 +459,5 @@ uint64_t BlockBasedTableBuilder::FileSize() const { const std::string BlockBasedTable::kFilterBlockPrefix = "filter."; -const std::string BlockBasedTable::kPropertiesBlock = - "rocksdb.properties"; } // namespace rocksdb diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 080daa5a7..dd6d0e7ae 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -21,6 +21,7 @@ #include "table/block.h" #include "table/filter_block.h" #include "table/format.h" +#include "table/meta_blocks.h" #include "table/two_level_iterator.h" #include "util/coding.h" @@ -250,10 +251,16 @@ Status BlockBasedTable::Open(const Options& options, // Read the properties meta_iter->Seek(kPropertiesBlock); - if (meta_iter->Valid() && meta_iter->key() == Slice(kPropertiesBlock)) { + if (meta_iter->Valid() && meta_iter->key() == kPropertiesBlock) { s = meta_iter->status(); if (s.ok()) { - s = ReadProperties(meta_iter->value(), rep, &rep->table_properties); + s = ReadProperties( + meta_iter->value(), + rep->file.get(), + rep->options.env, + rep->options.info_log.get(), + &rep->table_properties + ); } if (!s.ok()) { @@ -401,96 +408,6 @@ FilterBlockReader* BlockBasedTable::ReadFilter ( rep->options, block.data, block.heap_allocated); } -Status BlockBasedTable::ReadProperties( - const Slice& handle_value, Rep* rep, TableProperties* table_properties) { - assert(table_properties); - - Slice v = handle_value; - BlockHandle handle; - if (!handle.DecodeFrom(&v).ok()) { - return Status::InvalidArgument("Failed to decode properties block handle"); - } - - BlockContents block_contents; - Status s = ReadBlockContents( - rep->file.get(), - ReadOptions(), - handle, - &block_contents, - rep->options.env, - false - ); - - if (!s.ok()) { - return s; - } - - Block properties_block(block_contents); - std::unique_ptr iter( - properties_block.NewIterator(BytewiseComparator()) - ); - - // All pre-defined properties of type uint64_t - std::unordered_map predefined_uint64_properties = { - { TablePropertiesNames::kDataSize, - &table_properties->data_size }, - { TablePropertiesNames::kIndexSize, - &table_properties->index_size }, - { TablePropertiesNames::kFilterSize, - &table_properties->filter_size }, - { TablePropertiesNames::kRawKeySize, - &table_properties->raw_key_size }, - { TablePropertiesNames::kRawValueSize, - &table_properties->raw_value_size }, - { TablePropertiesNames::kNumDataBlocks, - 
&table_properties->num_data_blocks }, - { TablePropertiesNames::kNumEntries, - &table_properties->num_entries }, - }; - - std::string last_key; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - s = iter->status(); - if (!s.ok()) { - break; - } - - auto key = iter->key().ToString(); - // properties block is strictly sorted with no duplicate key. - assert( - last_key.empty() || - BytewiseComparator()->Compare(key, last_key) > 0 - ); - last_key = key; - - auto raw_val = iter->value(); - auto pos = predefined_uint64_properties.find(key); - - if (pos != predefined_uint64_properties.end()) { - // handle predefined rocksdb properties - uint64_t val; - if (!GetVarint64(&raw_val, &val)) { - // skip malformed value - auto error_msg = - "[Warning] detect malformed value in properties meta-block:" - "\tkey: " + key + "\tval: " + raw_val.ToString(); - Log(rep->options.info_log, "%s", error_msg.c_str()); - continue; - } - *(pos->second) = val; - } else if (key == TablePropertiesNames::kFilterPolicy) { - table_properties->filter_policy_name = raw_val.ToString(); - } else { - // handle user-collected - table_properties->user_collected_properties.insert( - std::make_pair(key, raw_val.ToString()) - ); - } - } - - return s; -} - Status BlockBasedTable::GetBlock( const BlockBasedTable* table, const BlockHandle& handle, diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 05811b5d3..3485a4534 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -38,7 +38,6 @@ using std::unique_ptr; class BlockBasedTable : public TableReader { public: static const std::string kFilterBlockPrefix; - static const std::string kPropertiesBlock; // Attempt to open the table that is stored in bytes [0..file_size) // of "file", and read the metadata entries necessary to allow @@ -142,7 +141,6 @@ class BlockBasedTable : public TableReader { void ReadMeta(const Footer& footer); void ReadFilter(const Slice& filter_handle_value); - static Status ReadProperties(const Slice& handle_value, Rep* rep); // Read the meta block from sst. static Status ReadMetaBlock( @@ -156,10 +154,6 @@ class BlockBasedTable : public TableReader { Rep* rep, size_t* filter_size = nullptr); - // Read the table properties from properties block. 
- static Status ReadProperties( - const Slice& handle_value, Rep* rep, TableProperties* properties); - static void SetupCacheKeyPrefix(Rep* rep); explicit BlockBasedTable(Rep* rep) : diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index df3ee5dae..95eb6c4ab 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -8,6 +8,7 @@ #include #include "rocksdb/table_properties.h" +#include "table/block.h" #include "table/format.h" #include "util/coding.h" @@ -131,4 +132,147 @@ bool NotifyCollectTableCollectorsOnFinish( return all_succeeded; } +Status ReadProperties( + const Slice& handle_value, + RandomAccessFile* file, + Env* env, + Logger* logger, + TableProperties* table_properties) { + assert(table_properties); + + Slice v = handle_value; + BlockHandle handle; + if (!handle.DecodeFrom(&v).ok()) { + return Status::InvalidArgument("Failed to decode properties block handle"); + } + + BlockContents block_contents; + Status s = ReadBlockContents( + file, + ReadOptions(), + handle, + &block_contents, + env, + false + ); + + if (!s.ok()) { + return s; + } + + Block properties_block(block_contents); + std::unique_ptr iter( + properties_block.NewIterator(BytewiseComparator()) + ); + + // All pre-defined properties of type uint64_t + std::unordered_map predefined_uint64_properties = { + { TablePropertiesNames::kDataSize, &table_properties->data_size }, + { TablePropertiesNames::kIndexSize, &table_properties->index_size }, + { TablePropertiesNames::kFilterSize, &table_properties->filter_size }, + { TablePropertiesNames::kRawKeySize, &table_properties->raw_key_size }, + { TablePropertiesNames::kRawValueSize, &table_properties->raw_value_size }, + { TablePropertiesNames::kNumDataBlocks, + &table_properties->num_data_blocks }, + { TablePropertiesNames::kNumEntries, &table_properties->num_entries }, + }; + + std::string last_key; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + s = iter->status(); + if (!s.ok()) { + break; + } + + auto key = iter->key().ToString(); + // properties block is strictly sorted with no duplicate key. 
+ assert( + last_key.empty() || + BytewiseComparator()->Compare(key, last_key) > 0 + ); + last_key = key; + + auto raw_val = iter->value(); + auto pos = predefined_uint64_properties.find(key); + + if (pos != predefined_uint64_properties.end()) { + // handle predefined rocksdb properties + uint64_t val; + if (!GetVarint64(&raw_val, &val)) { + // skip malformed value + auto error_msg = + "[Warning] detect malformed value in properties meta-block:" + "\tkey: " + key + "\tval: " + raw_val.ToString(); + Log(logger, "%s", error_msg.c_str()); + continue; + } + *(pos->second) = val; + } else if (key == TablePropertiesNames::kFilterPolicy) { + table_properties->filter_policy_name = raw_val.ToString(); + } else { + // handle user-collected properties + table_properties->user_collected_properties.insert( + std::make_pair(key, raw_val.ToString()) + ); + } + } + + return s; +} + +Status ReadTableProperties( + RandomAccessFile* file, + uint64_t file_size, + uint64_t table_magic_number, + Env* env, + Logger* info_log, + TableProperties* properties) { + // -- Read metaindex block + Footer footer(table_magic_number); + auto s = ReadFooterFromFile(file, file_size, &footer); + if (!s.ok()) { + return s; + } + + auto metaindex_handle = footer.metaindex_handle(); + BlockContents metaindex_contents; + s = ReadBlockContents( + file, + ReadOptions(), + metaindex_handle, + &metaindex_contents, + env, + false + ); + if (!s.ok()) { + return s; + } + Block metaindex_block(metaindex_contents); + std::unique_ptr meta_iter( + metaindex_block.NewIterator(BytewiseComparator()) + ); + + // -- Read property block + meta_iter->Seek(kPropertiesBlock); + TableProperties table_properties; + if (meta_iter->Valid() && + meta_iter->key() == kPropertiesBlock && + meta_iter->status().ok()) { + s = ReadProperties( + meta_iter->value(), + file, + env, + info_log, + properties + ); + } else { + s = Status::Corruption( + "Unable to read the property block from the plain table" + ); + } + + return s; +} + + } // namespace rocksdb diff --git a/table/meta_blocks.h b/table/meta_blocks.h index d0718ec07..a773c7b38 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -15,9 +15,11 @@ namespace rocksdb { -class BlockHandle; class BlockBuilder; +class BlockHandle; +class Env; class Logger; +class RandomAccessFile; struct TableProperties; // An STL style comparator that does the bytewise comparator comparasion @@ -49,11 +51,6 @@ class MetaIndexBuilder { Slice Finish(); private: - // * Key: meta block name - // * Value: block handle to that meta block - struct Rep; - Rep* rep_; - // store the sorted key/handle of the metablocks. BytewiseSortedMap meta_block_handles_; std::unique_ptr meta_index_block_; @@ -103,4 +100,21 @@ bool NotifyCollectTableCollectorsOnFinish( Logger* info_log, PropertyBlockBuilder* builder); +// Read the properties from the table. +Status ReadProperties( + const Slice& handle_value, + RandomAccessFile* file, + Env* env, + Logger* logger, + TableProperties* table_properties); + +// Directly read the properties from the properties block of a plain table. 
+Status ReadTableProperties( + RandomAccessFile* file, + uint64_t file_size, + uint64_t table_magic_number, + Env* env, + Logger* info_log, + TableProperties* properties); + } // namespace rocksdb diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc index 30d7e7d6e..970599a9b 100644 --- a/table/plain_table_builder.cc +++ b/table/plain_table_builder.cc @@ -14,12 +14,40 @@ #include "table/block_builder.h" #include "table/filter_block.h" #include "table/format.h" +#include "table/meta_blocks.h" #include "util/coding.h" #include "util/crc32c.h" #include "util/stop_watch.h" namespace rocksdb { +namespace { + +// a utility that helps writing block content to the file +// @offset will advance if @block_contents was successfully written. +// @block_handle the block handle this particular block. +Status WriteBlock( + const Slice& block_contents, + WritableFile* file, + uint64_t* offset, + BlockHandle* block_handle) { + block_handle->set_offset(*offset); + block_handle->set_size(block_contents.size()); + Status s = file->Append(block_contents); + + if (s.ok()) { + *offset += block_contents.size(); + } + return s; +} + +} // namespace + +// kPlainTableMagicNumber was picked by running +// echo rocksdb.plain.table | sha1sum +// and taking the leading 64 bits. +extern const uint64_t kPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; + PlainTableBuilder::PlainTableBuilder(const Options& options, WritableFile* file, int user_key_size, int key_prefix_len) : @@ -28,13 +56,16 @@ PlainTableBuilder::PlainTableBuilder(const Options& options, PutFixed32(&version, 1 | 0x80000000); file_->Append(Slice(version)); offset_ = 4; -} -PlainTableBuilder::~PlainTableBuilder() { + // for plain table, we put all the data in a big chuck. + properties_.num_data_blocks = 1; + // emphasize that currently plain table doesn't have persistent index or + // filter block. + properties_.index_size = 0; + properties_.filter_size = 0; } -Status PlainTableBuilder::ChangeOptions(const Options& options) { - return Status::OK(); +PlainTableBuilder::~PlainTableBuilder() { } void PlainTableBuilder::Add(const Slice& key, const Slice& value) { @@ -52,7 +83,17 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { file_->Append(value); offset_ += value_size + size.length(); - num_entries_++; + properties_.num_entries++; + properties_.raw_key_size += key.size(); + properties_.raw_value_size += value.size(); + + // notify property collectors + NotifyCollectTableCollectorsOnAdd( + key, + value, + options_.table_properties_collectors, + options_.info_log.get() + ); } Status PlainTableBuilder::status() const { @@ -62,7 +103,63 @@ Status PlainTableBuilder::status() const { Status PlainTableBuilder::Finish() { assert(!closed_); closed_ = true; - return Status::OK(); + + properties_.data_size = offset_; + + // Write the following blocks + // 1. [meta block: properties] + // 2. [metaindex block] + // 3. 
[footer] + MetaIndexBuilder meta_index_builer; + + PropertyBlockBuilder property_block_builder; + // -- Add basic properties + property_block_builder.AddTableProperty(properties_); + + // -- Add user collected properties + NotifyCollectTableCollectorsOnFinish( + options_.table_properties_collectors, + options_.info_log.get(), + &property_block_builder + ); + + // -- Write property block + BlockHandle property_block_handle; + auto s = WriteBlock( + property_block_builder.Finish(), + file_, + &offset_, + &property_block_handle + ); + if (!s.ok()) { + return s; + } + meta_index_builer.Add(kPropertiesBlock, property_block_handle); + + // -- write metaindex block + BlockHandle metaindex_block_handle; + s = WriteBlock( + meta_index_builer.Finish(), + file_, + &offset_, + &metaindex_block_handle + ); + if (!s.ok()) { + return s; + } + + // Write Footer + Footer footer(kPlainTableMagicNumber); + footer.set_metaindex_handle(metaindex_block_handle); + footer.set_index_handle(BlockHandle::NullBlockHandle()); + std::string footer_encoding; + footer.EncodeTo(&footer_encoding); + s = file_->Append(footer_encoding); + if (s.ok()) { + offset_ += footer_encoding.size(); + } + + return s; } void PlainTableBuilder::Abandon() { @@ -70,7 +167,7 @@ void PlainTableBuilder::Abandon() { } uint64_t PlainTableBuilder::NumEntries() const { - return num_entries_; + return properties_.num_entries; } uint64_t PlainTableBuilder::FileSize() const { diff --git a/table/plain_table_builder.h b/table/plain_table_builder.h index a994b337c..b8a2bbe3b 100644 --- a/table/plain_table_builder.h +++ b/table/plain_table_builder.h @@ -32,14 +32,6 @@ public: // REQUIRES: Either Finish() or Abandon() has been called. ~PlainTableBuilder(); - // Change the options used by this builder. Note: only some of the - // option fields can be changed after construction. If a field is - // not allowed to change dynamically and its value in the structure - // passed to the constructor is different from its value in the - // structure passed to this method, this method will return an error - // without changing any fields. - Status ChangeOptions(const Options& options); - // Add key,value to the table being constructed. // REQUIRES: key is after any previously added key according to comparator. // REQUIRES: Finish(), Abandon() have not been called @@ -72,7 +64,7 @@ private: WritableFile* file_; uint64_t offset_ = 0; Status status_; - uint64_t num_entries_ = 0; + TableProperties properties_; const size_t user_key_size_; bool closed_ = false; // Either Finish() or Abandon() has been called. 
diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index e808948ab..1562f7cfd 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -19,6 +19,7 @@ #include "table/block.h" #include "table/filter_block.h" #include "table/format.h" +#include "table/meta_blocks.h" #include "table/two_level_iterator.h" #include "util/coding.h" @@ -41,6 +42,7 @@ public: namespace rocksdb { +extern const uint64_t kPlainTableMagicNumber; static uint32_t getBucketId(Slice const& s, size_t prefix_len, uint32_t num_buckets) { return MurmurHash(s.data(), prefix_len, 397) % num_buckets; @@ -49,18 +51,16 @@ static uint32_t getBucketId(Slice const& s, size_t prefix_len, PlainTableReader::PlainTableReader(const EnvOptions& storage_options, uint64_t file_size, int user_key_size, int key_prefix_len, int bloom_bits_per_key, - double hash_table_ratio) : + double hash_table_ratio, + const TableProperties& table_properties) : hash_table_size_(0), soptions_(storage_options), file_size_(file_size), user_key_size_(user_key_size), key_prefix_len_(key_prefix_len), - hash_table_ratio_(hash_table_ratio) { - if (bloom_bits_per_key > 0) { - filter_policy_ = NewBloomFilterPolicy(bloom_bits_per_key); - } else { - filter_policy_ = nullptr; - } - hash_table_ = nullptr; - data_start_offset_ = 0; - data_end_offset_ = file_size; + hash_table_ratio_(hash_table_ratio), + filter_policy_(bloom_bits_per_key > 0 ? + NewBloomFilterPolicy(bloom_bits_per_key) : nullptr), + table_properties_(table_properties), + data_start_offset_(0), + data_end_offset_(table_properties_.data_size) { } PlainTableReader::~PlainTableReader() { @@ -87,19 +87,38 @@ Status PlainTableReader::Open(const Options& options, return Status::NotSupported("File is too large for PlainTableReader!"); } - PlainTableReader* t = new PlainTableReader(soptions, file_size, - user_key_size, - key_prefix_len, - bloom_num_bits, - hash_table_ratio); - t->file_ = std::move(file); - t->options_ = options; - Status s = t->PopulateIndex(file_size); + TableProperties table_properties; + auto s = ReadTableProperties( + file.get(), + file_size, + kPlainTableMagicNumber, + options.env, + options.info_log.get(), + &table_properties + ); if (!s.ok()) { - delete t; return s; } - table_reader->reset(t); + + std::unique_ptr new_reader(new PlainTableReader( + soptions, + file_size, + user_key_size, + key_prefix_len, + bloom_num_bits, + hash_table_ratio, + table_properties + )); + new_reader->file_ = std::move(file); + new_reader->options_ = options; + + // -- Populate Index + s = new_reader->PopulateIndex(); + if (!s.ok()) { + return s; + } + + *table_reader = std::move(new_reader); return s; } @@ -114,7 +133,7 @@ Iterator* PlainTableReader::NewIterator(const ReadOptions& options) { return new PlainTableIterator(this); } -Status PlainTableReader::PopulateIndex(uint64_t file_size) { +Status PlainTableReader::PopulateIndex() { // Get mmapped memory to file_data_. Status s = file_->Read(0, file_size_, &file_data_, nullptr); if (!s.ok()) { @@ -124,7 +143,6 @@ Status PlainTableReader::PopulateIndex(uint64_t file_size) { version_ ^= 0x80000000; assert(version_ == 1); data_start_offset_ = 4; - data_end_offset_ = file_size; Slice key_slice; Slice key_prefix_slice; @@ -140,7 +158,7 @@ Status PlainTableReader::PopulateIndex(uint64_t file_size) { // are in order. 
std::vector> prefix_index_pairs; std::string current_prefix_index; - while (pos < file_size) { + while (pos < data_end_offset_) { uint32_t key_offset = pos; status_ = Next(pos, &key_slice, &value_slice, pos); key_prefix_slice = Slice(key_slice.data(), key_prefix_len_); diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index eea8adfe6..26a506d14 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -78,16 +78,21 @@ public: void SetupForCompaction(); TableProperties& GetTableProperties() { - return tbl_props; + return table_properties_; } - PlainTableReader(const EnvOptions& storage_options, uint64_t file_size, - int user_key_size, int key_prefix_len, int bloom_num_bits, - double hash_table_ratio); + PlainTableReader( + const EnvOptions& storage_options, + uint64_t file_size, + int user_key_size, + int key_prefix_len, + int bloom_num_bits, + double hash_table_ratio, + const TableProperties& table_properties); ~PlainTableReader(); private: - uint32_t* hash_table_; + uint32_t* hash_table_ = nullptr; int hash_table_size_; std::string sub_index_; @@ -99,8 +104,6 @@ private: Slice file_data_; uint32_t version_; uint32_t file_size_; - uint32_t data_start_offset_; - uint32_t data_end_offset_; const size_t user_key_size_; const size_t key_prefix_len_; const double hash_table_ratio_; @@ -108,7 +111,9 @@ private: std::string filter_str_; Slice filter_slice_; - TableProperties tbl_props; + TableProperties table_properties_; + uint32_t data_start_offset_; + uint32_t data_end_offset_; static const size_t kNumInternalBytes = 8; static const uint32_t kSubIndexMask = 0x80000000; @@ -125,7 +130,7 @@ private: // any query to the table. // This query will populate the hash table hash_table_, the second // level of indexes sub_index_ and bloom filter filter_slice_ if enabled. 
- Status PopulateIndex(uint64_t file_size); + Status PopulateIndex(); // Check bloom filter to see whether it might contain this prefix bool MayHavePrefix(const Slice& target_prefix); diff --git a/table/table_properties.cc b/table/table_properties.cc index 2c9905884..47e7f8b33 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -105,4 +105,6 @@ const std::string TablePropertiesNames::kNumEntries = const std::string TablePropertiesNames::kFilterPolicy = "rocksdb.filter.policy"; +extern const std::string kPropertiesBlock = "rocksdb.properties"; + } // namespace rocksdb diff --git a/table/table_test.cc b/table/table_test.cc index 1f79fcdf9..7711ed8ad 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -15,17 +15,22 @@ #include "db/db_statistics.h" #include "db/memtable.h" #include "db/write_batch_internal.h" + #include "rocksdb/cache.h" #include "rocksdb/db.h" +#include "rocksdb/plain_table_factory.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/memtablerep.h" + +#include "table/meta_blocks.h" #include "table/block_based_table_builder.h" #include "table/block_based_table_factory.h" #include "table/block_based_table_reader.h" #include "table/block_builder.h" #include "table/block.h" #include "table/format.h" + #include "util/random.h" #include "util/testharness.h" #include "util/testutil.h" @@ -743,49 +748,6 @@ class Harness { Constructor* constructor_; }; -// Test the empty key -TEST(Harness, SimpleEmptyKey) { - std::vector args = GenerateArgList(); - for (unsigned int i = 0; i < args.size(); i++) { - Init(args[i]); - Random rnd(test::RandomSeed() + 1); - Add("", "v"); - Test(&rnd); - } -} - -TEST(Harness, SimpleSingle) { - std::vector args = GenerateArgList(); - for (unsigned int i = 0; i < args.size(); i++) { - Init(args[i]); - Random rnd(test::RandomSeed() + 2); - Add("abc", "v"); - Test(&rnd); - } -} - -TEST(Harness, SimpleMulti) { - std::vector args = GenerateArgList(); - for (unsigned int i = 0; i < args.size(); i++) { - Init(args[i]); - Random rnd(test::RandomSeed() + 3); - Add("abc", "v"); - Add("abcd", "v"); - Add("ac", "v2"); - Test(&rnd); - } -} - -TEST(Harness, SimpleSpecialKey) { - std::vector args = GenerateArgList(); - for (unsigned int i = 0; i < args.size(); i++) { - Init(args[i]); - Random rnd(test::RandomSeed() + 4); - Add("\xff\xff", "v3"); - Test(&rnd); - } -} - static bool Between(uint64_t val, uint64_t low, uint64_t high) { bool result = (val >= low) && (val <= high); if (!result) { @@ -801,7 +763,7 @@ class TableTest { }; // This test include all the basic checks except those for index size and block // size, which will be conducted in separated unit tests. 
-TEST(TableTest, BasicTableProperties) { +TEST(TableTest, BasicBlockedBasedTableProperties) { BlockBasedTableConstructor c(BytewiseComparator()); c.Add("a1", "val1"); @@ -845,6 +807,47 @@ TEST(TableTest, BasicTableProperties) { ); } +extern const uint64_t kPlainTableMagicNumber; +TEST(TableTest, BasicPlainTableProperties) { + PlainTableFactory factory(8, 8, 0); + StringSink sink; + std::unique_ptr builder(factory.GetTableBuilder( + Options(), + &sink, + kNoCompression + )); + + for (char c = 'a'; c <= 'z'; ++c) { + std::string key(16, c); + std::string value(28, c + 42); + builder->Add(key, value); + } + ASSERT_OK(builder->Finish()); + + StringSource source(sink.contents(), 72242); + + TableProperties props; + auto s = ReadTableProperties( + &source, + sink.contents().size(), + kPlainTableMagicNumber, + Env::Default(), + nullptr, + &props + ); + ASSERT_OK(s); + + ASSERT_EQ(0ul, props.index_size); + ASSERT_EQ(0ul, props.filter_size); + ASSERT_EQ(16ul * 26, props.raw_key_size); + ASSERT_EQ(28ul * 26, props.raw_value_size); + ASSERT_EQ(26ul, props.num_entries); + ASSERT_EQ(1ul, props.num_data_blocks); + + // User collected keys + // internal keys +} + TEST(TableTest, FilterPolicyNameProperties) { BlockBasedTableConstructor c(BytewiseComparator()); c.Add("a1", "val1"); @@ -1292,6 +1295,48 @@ TEST(MemTableTest, Simple) { delete memtable->Unref(); } +// Test the empty key +TEST(Harness, SimpleEmptyKey) { + std::vector args = GenerateArgList(); + for (unsigned int i = 0; i < args.size(); i++) { + Init(args[i]); + Random rnd(test::RandomSeed() + 1); + Add("", "v"); + Test(&rnd); + } +} + +TEST(Harness, SimpleSingle) { + std::vector args = GenerateArgList(); + for (unsigned int i = 0; i < args.size(); i++) { + Init(args[i]); + Random rnd(test::RandomSeed() + 2); + Add("abc", "v"); + Test(&rnd); + } +} + +TEST(Harness, SimpleMulti) { + std::vector args = GenerateArgList(); + for (unsigned int i = 0; i < args.size(); i++) { + Init(args[i]); + Random rnd(test::RandomSeed() + 3); + Add("abc", "v"); + Add("abcd", "v"); + Add("ac", "v2"); + Test(&rnd); + } +} + +TEST(Harness, SimpleSpecialKey) { + std::vector args = GenerateArgList(); + for (unsigned int i = 0; i < args.size(); i++) { + Init(args[i]); + Random rnd(test::RandomSeed() + 4); + Add("\xff\xff", "v3"); + Test(&rnd); + } +} } // namespace rocksdb From 28c24de8be0cdb9917fd7832f19d2e6137e789b6 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Fri, 13 Dec 2013 18:20:32 -0800 Subject: [PATCH 38/70] [RocksDB Peformance Branch] A bug in PlainTable format Summary: A bug to fix. 
It's already fixed in D14457, but want to check it in sooner to unblock tests

Test Plan: plain_table_db_test

Reviewers: nkg-, haobo

Reviewed By: nkg-

CC: kailiu, leveldb

Differential Revision: https://reviews.facebook.net/D14673
---
 table/plain_table_reader.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc
index 1562f7cfd..5f68a183a 100644
--- a/table/plain_table_reader.cc
+++ b/table/plain_table_reader.cc
@@ -279,7 +279,7 @@ uint32_t PlainTableReader::GetOffset(const Slice& target,
     const char* index_offset = sub_index_.data() + base_offset
         + kOffsetLen * mid;
     uint32_t file_offset = DecodeFixed32(index_offset);
-    mid_key = Slice(file_data_.data() + file_offset, user_key_size_);
+    mid_key = Slice(file_data_.data() + file_offset, GetInternalKeyLength());
     int cmp_result = options_.comparator->Compare(target, mid_key);
     if (cmp_result > 0) {

From abaf26266d5bbefee8dc3498fe2094efba443f18 Mon Sep 17 00:00:00 2001
From: Siying Dong
Date: Fri, 20 Dec 2013 09:35:24 -0800
Subject: [PATCH 39/70] [RocksDB] [Performance Branch] Some Changes to
 PlainTable format

Summary:
Some changes to PlainTable format:
(1) support variable key length
(2) use user defined slice transformer to extract prefixes
(3) Run some test cases against PlainTable in db_test and table_test

Test Plan: test db_test

Reviewers: haobo, kailiu

CC: dhruba, igor, leveldb, nkg-

Differential Revision: https://reviews.facebook.net/D14457
---
 db/db_test.cc                         |  71 ++--
 db/plain_table_db_test.cc             |   3 +-
 include/rocksdb/plain_table_factory.h |  45 ++-
 include/rocksdb/table_properties.h    |   6 +
 table/meta_blocks.cc                  |   4 +
 table/plain_table_builder.cc          |  37 ++-
 table/plain_table_builder.h           |  11 +-
 table/plain_table_factory.cc          |   5 +-
 table/plain_table_reader.cc           | 455 ++++++++++++++++++--------
 table/plain_table_reader.h            | 107 +++---
 table/table_properties.cc             |   4 +
 table/table_reader_bench.cc           |   6 +-
 table/table_test.cc                   | 305 +++++++++++------
 util/dynamic_bloom.cc                 |  10 +-
 util/dynamic_bloom.h                  |   5 +
 15 files changed, 716 insertions(+), 358 deletions(-)

diff --git a/db/db_test.cc b/db/db_test.cc
index 27e90f110..10babbac6 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -23,6 +23,7 @@
 #include "rocksdb/env.h"
 #include "rocksdb/table.h"
 #include "rocksdb/perf_context.h"
+#include "rocksdb/plain_table_factory.h"
 #include "util/hash.h"
 #include "util/logging.h"
 #include "util/mutexlock.h"
@@ -244,6 +245,8 @@ class DBTest {
   // Sequence of option configurations to try
   enum OptionConfig {
     kDefault,
+    kPlainTableFirstBytePrefix,
+    kPlainTableAllBytesPrefix,
     kVectorRep,
     kMergePut,
     kFilter,
@@ -275,7 +278,8 @@ class DBTest {
     kNoSkip = 0,
     kSkipDeletesFilterFirst = 1,
     kSkipUniversalCompaction = 2,
-    kSkipMergePut = 4
+    kSkipMergePut = 4,
+    kSkipPlainTable = 8
   };

   DBTest() : option_config_(kDefault),

   // Switch to a fresh database with the next option configuration to
   // test. Return false if there are no more configurations to test.
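  // The tests below drive this helper in a do/while loop, e.g.:
  //
  //   do {
  //     // ... run assertions against the current option configuration ...
  //   } while (ChangeOptions(kSkipUniversalCompaction | kSkipPlainTable));
  //
  // so each test body runs once per surviving configuration, and the skip
  // mask prunes configurations a particular test cannot support.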
bool ChangeOptions(int skip_mask = kNoSkip) { - option_config_++; - // skip some options - if (skip_mask & kSkipDeletesFilterFirst && - option_config_ == kDeletesFilterFirst) { - option_config_++; - } - if (skip_mask & kSkipUniversalCompaction && - option_config_ == kUniversalCompaction) { - option_config_++; - } - if (skip_mask & kSkipMergePut && option_config_ == kMergePut) { - option_config_++; + for(option_config_++; option_config_ < kEnd; option_config_++) { + if ((skip_mask & kSkipDeletesFilterFirst) && + option_config_ == kDeletesFilterFirst) { + continue; + } + if ((skip_mask & kSkipUniversalCompaction) && + option_config_ == kUniversalCompaction) { + continue; + } + if ((skip_mask & kSkipMergePut) && option_config_ == kMergePut) { + continue; + } + if ((skip_mask & kSkipPlainTable) + && (option_config_ == kPlainTableAllBytesPrefix + || option_config_ == kPlainTableFirstBytePrefix)) { + continue; + } + break; } + if (option_config_ >= kEnd) { Destroy(&last_options_); return false; @@ -343,6 +354,18 @@ class DBTest { options.memtable_factory.reset( NewHashSkipListRepFactory(NewFixedPrefixTransform(1))); break; + case kPlainTableFirstBytePrefix: + options.table_factory.reset(new PlainTableFactory()); + options.prefix_extractor = NewFixedPrefixTransform(1); + options.allow_mmap_reads = true; + options.max_sequential_skip_in_iterations = 999999; + break; + case kPlainTableAllBytesPrefix: + options.table_factory.reset(new PlainTableFactory()); + options.prefix_extractor = NewNoopTransform(); + options.allow_mmap_reads = true; + options.max_sequential_skip_in_iterations = 999999; + break; case kMergePut: options.merge_operator = MergeOperators::CreatePutOperator(); break; @@ -1009,7 +1032,10 @@ TEST(DBTest, KeyMayExist) { options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); delete options.filter_policy; - } while (ChangeOptions()); + + // KeyMayExist function only checks data in block caches, which is not used + // by plain table format. + } while (ChangeOptions(kSkipPlainTable)); } TEST(DBTest, NonBlockingIteration) { @@ -1073,7 +1099,9 @@ TEST(DBTest, NonBlockingIteration) { options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); delete iter; - } while (ChangeOptions()); + // This test verifies block cache behaviors, which is not used by plain + // table format. + } while (ChangeOptions(kSkipPlainTable)); } // A delete is skipped for key if KeyMayExist(key) returns False @@ -2932,7 +2960,8 @@ TEST(DBTest, ApproximateSizes) { ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_GT(NumTableFilesAtLevel(1), 0); } - } while (ChangeOptions(kSkipUniversalCompaction)); + // ApproximateOffsetOf() is not yet implemented in plain table format. + } while (ChangeOptions(kSkipUniversalCompaction | kSkipPlainTable)); } TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { @@ -2970,7 +2999,8 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { dbfull()->TEST_CompactRange(0, nullptr, nullptr); } - } while (ChangeOptions()); + // ApproximateOffsetOf() is not yet implemented in plain table format. + } while (ChangeOptions(kSkipPlainTable)); } TEST(DBTest, IteratorPinsRef) { @@ -3054,7 +3084,9 @@ TEST(DBTest, HiddenValuesAreRemoved) { ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000)); - } while (ChangeOptions(kSkipUniversalCompaction)); + // ApproximateOffsetOf() is not yet implemented in plain table format, + // which is used by Size(). 
+ } while (ChangeOptions(kSkipUniversalCompaction | kSkipPlainTable)); } TEST(DBTest, CompactBetweenSnapshots) { @@ -4626,7 +4658,8 @@ TEST(DBTest, Randomized) { // TODO(sanjay): Test Get() works int p = rnd.Uniform(100); int minimum = 0; - if (option_config_ == kHashSkipList) { + if (option_config_ == kHashSkipList || + option_config_ == kPlainTableFirstBytePrefix) { minimum = 1; } if (p < 45) { // Put diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 0baf56ecd..17f871e4c 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -59,7 +59,8 @@ public: // Return the current option configuration. Options CurrentOptions() { Options options; - options.table_factory.reset(new PlainTableFactory(16, 8, 2, 0.8)); + options.table_factory.reset(new PlainTableFactory(16, 2, 0.8)); + options.prefix_extractor = NewFixedPrefixTransform(8); options.allow_mmap_reads = true; return options; } diff --git a/include/rocksdb/plain_table_factory.h b/include/rocksdb/plain_table_factory.h index 2355e43d4..5cf59d23a 100644 --- a/include/rocksdb/plain_table_factory.h +++ b/include/rocksdb/plain_table_factory.h @@ -23,41 +23,37 @@ class TableBuilder; // IndexedTable requires fixed length key, configured as a constructor // parameter of the factory class. Output file format: -// +-------------+ -// | version | -// +-------------+------------------------------+ <= key1 offset -// | key1 | value_size (4 bytes) | | -// +----------------------------------------+ | +// +-------------+-----------------+ +// | version | user_key_length | +// +------------++------------------------------+ <= key1 offset +// | [key_size] | key1 | value_size | | +// +------------+-------------+-------------+ | // | value1 | // | | // +----------------------------------------+---+ <= key2 offset -// | key2 | value_size (4 bytes) | | -// +----------------------------------------+ | +// | [key_size] | key2 | value_size | | +// +------------+-------------+-------------+ | // | value2 | // | | // | ...... | -// +-----------------+--------------------------+ <= index_block_offset -// | key1 | key1 offset (8 bytes) | // +-----------------+--------------------------+ -// | key2 | key2 offset (8 bytes) | -// +-----------------+--------------------------+ -// | key3 | key3 offset (8 bytes) | -// +-----------------+--------------------------+ -// | ...... | -// +-----------------+------------+-------------+ +// If user_key_length = kVariableLength, it means the key is variable length, +// there will be an extra field for key size encoded before every key. class PlainTableFactory: public TableFactory { public: ~PlainTableFactory() { } - // user_key_size is the length of the user key. key_prefix_len is the - // length of the prefix used for in-memory indexes. bloom_num_bits is + // user_key_size is the length of the user key. If it is set to be + // kVariableLength, then it means variable length. Otherwise, all the + // keys need to have the fix length of this value. bloom_num_bits is // number of bits used for bloom filer per key. hash_table_ratio is - // the desired ultilization of the hash table used for prefix hashing. + // the desired utilization of the hash table used for prefix hashing. 
// hash_table_ratio = number of prefixes / #buckets in the hash table - PlainTableFactory(int user_key_size, int key_prefix_len, - int bloom_num_bits = 0, double hash_table_ratio = 0.75) : - user_key_size_(user_key_size), key_prefix_len_(key_prefix_len), - bloom_num_bits_(bloom_num_bits), hash_table_ratio_(hash_table_ratio) { + explicit PlainTableFactory(uint32_t user_key_len = kVariableLength, + int bloom_num_bits = 0, + double hash_table_ratio = 0.75) : + user_key_len_(user_key_len), bloom_num_bits_(bloom_num_bits), + hash_table_ratio_(hash_table_ratio) { } const char* Name() const override { return "PlainTable"; @@ -70,9 +66,10 @@ public: TableBuilder* GetTableBuilder(const Options& options, WritableFile* file, CompressionType compression_type) const override; + + static const uint32_t kVariableLength = 0; private: - int user_key_size_; - int key_prefix_len_; + uint32_t user_key_len_; int bloom_num_bits_; double hash_table_ratio_; }; diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index c2570acf6..b1b52e87a 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -37,6 +37,10 @@ struct TableProperties { uint64_t num_data_blocks = 0; // the number of entries in this table uint64_t num_entries = 0; + // format version, reserved for backward compatibility + uint64_t format_version = 0; + // If 0, key is variable length. Otherwise number of bytes for each key. + uint64_t fixed_key_len = 0; // The name of the filter policy used in this table. // If no filter policy is used, `filter_policy_name` will be an empty string. @@ -61,6 +65,8 @@ struct TablePropertiesNames { static const std::string kRawValueSize; static const std::string kNumDataBlocks; static const std::string kNumEntries; + static const std::string kFormatVersion; + static const std::string kFixedKeyLen; static const std::string kFilterPolicy; }; diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 95eb6c4ab..5d2d94175 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -67,6 +67,8 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { Add(TablePropertiesNames::kNumEntries, props.num_entries); Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks); Add(TablePropertiesNames::kFilterSize, props.filter_size); + Add(TablePropertiesNames::kFormatVersion, props.format_version); + Add(TablePropertiesNames::kFixedKeyLen, props.fixed_key_len); if (!props.filter_policy_name.empty()) { Add(TablePropertiesNames::kFilterPolicy, @@ -175,6 +177,8 @@ Status ReadProperties( { TablePropertiesNames::kNumDataBlocks, &table_properties->num_data_blocks }, { TablePropertiesNames::kNumEntries, &table_properties->num_entries }, + { TablePropertiesNames::kFormatVersion, &table_properties->format_version }, + { TablePropertiesNames::kFixedKeyLen, &table_properties->fixed_key_len }, }; std::string last_key; diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc index 970599a9b..5c3252360 100644 --- a/table/plain_table_builder.cc +++ b/table/plain_table_builder.cc @@ -50,12 +50,9 @@ extern const uint64_t kPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; PlainTableBuilder::PlainTableBuilder(const Options& options, WritableFile* file, - int user_key_size, int key_prefix_len) : - options_(options), file_(file), user_key_size_(user_key_size) { - std::string version; - PutFixed32(&version, 1 | 0x80000000); - file_->Append(Slice(version)); - offset_ = 4; + uint32_t user_key_len) : + options_(options), file_(file), 
user_key_len_(user_key_len) { + properties_.fixed_key_len = user_key_len; // for plain table, we put all the data in a big chuck. properties_.num_data_blocks = 1; @@ -63,25 +60,37 @@ PlainTableBuilder::PlainTableBuilder(const Options& options, // filter block. properties_.index_size = 0; properties_.filter_size = 0; + properties_.format_version = 0; } PlainTableBuilder::~PlainTableBuilder() { } void PlainTableBuilder::Add(const Slice& key, const Slice& value) { - assert((int) key.size() == GetInternalKeyLength()); + assert(user_key_len_ == 0 || key.size() == user_key_len_ + 8); + + if (!IsFixedLength()) { + // Write key length + int key_size = key.size(); + key_size_str_.clear(); + PutVarint32(&key_size_str_, key_size); + file_->Append(key_size_str_); + offset_ += key_size_str_.length(); + } - // Write key-value pair + // Write key file_->Append(key); - offset_ += GetInternalKeyLength(); + offset_ += key.size(); - std::string size; + // Write value length + value_size_str_.clear(); int value_size = value.size(); - PutVarint32(&size, value_size); - Slice sizeSlice(size); - file_->Append(sizeSlice); + PutVarint32(&value_size_str_, value_size); + file_->Append(value_size_str_); + + // Write value file_->Append(value); - offset_ += value_size + size.length(); + offset_ += value_size + value_size_str_.length(); properties_.num_entries++; properties_.raw_key_size += key.size(); diff --git a/table/plain_table_builder.h b/table/plain_table_builder.h index b8a2bbe3b..f4be46828 100644 --- a/table/plain_table_builder.h +++ b/table/plain_table_builder.h @@ -27,7 +27,7 @@ public: // will be part of level specified by 'level'. A value of -1 means // that the caller does not know which level the output file will reside. PlainTableBuilder(const Options& options, WritableFile* file, - int user_key_size, int key_prefix_len); + uint32_t user_key_size); // REQUIRES: Either Finish() or Abandon() has been called. ~PlainTableBuilder(); @@ -66,11 +66,14 @@ private: Status status_; TableProperties properties_; - const size_t user_key_size_; + const size_t user_key_len_; bool closed_ = false; // Either Finish() or Abandon() has been called. 
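  // Row layout produced by Add() for variable-length keys, sketched from
  // the writes above (both sizes are varint32-encoded):
  //
  //   [key_size][internal key bytes][value_size][value bytes]
  //
  // For example, user key "abc" (an 11-byte internal key once the 8
  // trailing internal bytes are appended) with value "v" serializes as
  // the byte 0x0B, the 11 key bytes, the byte 0x01, then "v".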
- int GetInternalKeyLength() { - return user_key_size_ + 8; + std::string key_size_str_; + std::string value_size_str_; + + bool IsFixedLength() const { + return user_key_len_ > 0; } // No copying allowed diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc index 08e75c4ec..bf941a62d 100644 --- a/table/plain_table_factory.cc +++ b/table/plain_table_factory.cc @@ -19,13 +19,12 @@ Status PlainTableFactory::GetTableReader(const Options& options, unique_ptr* table) const { return PlainTableReader::Open(options, soptions, std::move(file), file_size, - table, user_key_size_, key_prefix_len_, - bloom_num_bits_, hash_table_ratio_); + table, bloom_num_bits_, hash_table_ratio_); } TableBuilder* PlainTableFactory::GetTableBuilder( const Options& options, WritableFile* file, CompressionType compression_type) const { - return new PlainTableBuilder(options, file, user_key_size_, key_prefix_len_); + return new PlainTableBuilder(options, file, user_key_len_); } } // namespace rocksdb diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index 5f68a183a..4c396a359 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -15,6 +15,7 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/options.h" #include "rocksdb/statistics.h" +#include "rocksdb/plain_table_factory.h" #include "table/block.h" #include "table/filter_block.h" @@ -23,6 +24,7 @@ #include "table/two_level_iterator.h" #include "util/coding.h" +#include "util/dynamic_bloom.h" #include "util/hash.h" #include "util/histogram.h" #include "util/murmurhash.h" @@ -30,46 +32,36 @@ #include "util/stop_watch.h" -namespace std { -template<> -struct hash { -public: - std::size_t operator()(rocksdb::Slice const& s) const { - return MurmurHash(s.data(), s.size(), 397); - } -}; -} - namespace rocksdb { extern const uint64_t kPlainTableMagicNumber; -static uint32_t getBucketId(Slice const& s, size_t prefix_len, - uint32_t num_buckets) { - return MurmurHash(s.data(), prefix_len, 397) % num_buckets; + +static uint32_t GetSliceHash(Slice const& s) { + return Hash(s.data(), s.size(), 397) ; +} +static uint32_t getBucketIdFromHash(uint32_t hash, uint32_t num_buckets) { + return hash % num_buckets; } PlainTableReader::PlainTableReader(const EnvOptions& storage_options, - uint64_t file_size, int user_key_size, - int key_prefix_len, int bloom_bits_per_key, + uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio, const TableProperties& table_properties) : hash_table_size_(0), soptions_(storage_options), file_size_(file_size), - user_key_size_(user_key_size), key_prefix_len_(key_prefix_len), hash_table_ratio_(hash_table_ratio), - filter_policy_(bloom_bits_per_key > 0 ? 
- NewBloomFilterPolicy(bloom_bits_per_key) : nullptr), - table_properties_(table_properties), - data_start_offset_(0), - data_end_offset_(table_properties_.data_size) { + bloom_bits_per_key_(bloom_bits_per_key), + table_properties_(table_properties), data_start_offset_(0), + data_end_offset_(table_properties_.data_size), + user_key_len_(table_properties.fixed_key_len) { + hash_table_ = nullptr; + bloom_ = nullptr; + sub_index_ = nullptr; } PlainTableReader::~PlainTableReader() { - if (hash_table_ != nullptr) { - delete[] hash_table_; - } - if (filter_policy_ != nullptr) { - delete filter_policy_; - } + delete[] hash_table_; + delete[] sub_index_; + delete bloom_; } Status PlainTableReader::Open(const Options& options, @@ -77,8 +69,6 @@ Status PlainTableReader::Open(const Options& options, unique_ptr && file, uint64_t file_size, unique_ptr* table_reader, - const int user_key_size, - const int key_prefix_len, const int bloom_num_bits, double hash_table_ratio) { assert(options.allow_mmap_reads); @@ -103,8 +93,6 @@ Status PlainTableReader::Open(const Options& options, std::unique_ptr new_reader(new PlainTableReader( soptions, file_size, - user_key_size, - key_prefix_len, bloom_num_bits, hash_table_ratio, table_properties @@ -133,22 +121,69 @@ Iterator* PlainTableReader::NewIterator(const ReadOptions& options) { return new PlainTableIterator(this); } -Status PlainTableReader::PopulateIndex() { - // Get mmapped memory to file_data_. - Status s = file_->Read(0, file_size_, &file_data_, nullptr); - if (!s.ok()) { - return s; +struct PlainTableReader::IndexRecord { + uint32_t hash; // hash of the prefix + uint32_t offset; // offset of a row + IndexRecord* next; +}; + +// Helper class to track all the index records +class PlainTableReader::IndexRecordList { +public: + explicit IndexRecordList(size_t num_records_per_group) : + num_records_per_group_(num_records_per_group), + current_group_(nullptr), + num_records_in_current_group_(num_records_per_group) { + } + + ~IndexRecordList() { + for (size_t i = 0; i < groups_.size(); i++) { + delete[] groups_[i]; + } + } + + void AddRecord(murmur_t hash, uint32_t offset) { + if (num_records_in_current_group_ == num_records_per_group_) { + current_group_ = AllocateNewGroup(); + num_records_in_current_group_ = 0; + } + auto& new_record = current_group_[num_records_in_current_group_]; + new_record.hash = hash; + new_record.offset = offset; + new_record.next = nullptr; + num_records_in_current_group_++; + } + + size_t GetNumRecords() { + return (groups_.size() - 1) * num_records_per_group_ + + num_records_in_current_group_; + } + IndexRecord* At(size_t index) { + return &(groups_[index / num_records_per_group_] + [index % num_records_per_group_]); + } + + IndexRecord* AllocateNewGroup() { + IndexRecord* result = new IndexRecord[num_records_per_group_]; + groups_.push_back(result); + return result; } - version_ = DecodeFixed32(file_data_.data()); - version_ ^= 0x80000000; - assert(version_ == 1); - data_start_offset_ = 4; +private: + const size_t num_records_per_group_; + IndexRecord* current_group_; + // List of arrays allocated + std::vector groups_; + size_t num_records_in_current_group_; +}; +int PlainTableReader::PopulateIndexRecordList( + IndexRecordList& record_list) { Slice key_slice; Slice key_prefix_slice; Slice key_suffix_slice; Slice value_slice; Slice prev_key_prefix_slice; + uint32_t prev_key_prefix_hash = 0; uint32_t pos = data_start_offset_; int key_index_within_prefix = 0; bool first = true; @@ -156,72 +191,104 @@ Status 
PlainTableReader::PopulateIndex() { HistogramImpl keys_per_prefix_hist; // Need map to be ordered to make sure sub indexes generated // are in order. - std::vector> prefix_index_pairs; - std::string current_prefix_index; + + int num_prefixes = 0; + while (pos < data_end_offset_) { uint32_t key_offset = pos; status_ = Next(pos, &key_slice, &value_slice, pos); - key_prefix_slice = Slice(key_slice.data(), key_prefix_len_); + key_prefix_slice = GetPrefix(key_slice); if (first || prev_key_prefix_slice != key_prefix_slice) { + num_prefixes++; if (!first) { keys_per_prefix_hist.Add(key_index_within_prefix); - prefix_index_pairs.push_back( - std::make_pair( - std::move(prev_key_prefix_slice), - std::move(current_prefix_index))); - current_prefix_index.clear(); } key_index_within_prefix = 0; prev_key_prefix_slice = key_prefix_slice; + prev_key_prefix_hash = GetSliceHash(key_prefix_slice); } - if (key_index_within_prefix++ % 8 == 0) { - // Add an index key for every 8 keys - PutFixed32(¤t_prefix_index, key_offset); + if (key_index_within_prefix++ % 16 == 0) { + // Add an index key for every 16 keys + record_list.AddRecord(prev_key_prefix_hash, key_offset); } first = false; } - prefix_index_pairs.push_back( - std::make_pair(std::move(prev_key_prefix_slice), - std::move(current_prefix_index))); - keys_per_prefix_hist.Add(key_index_within_prefix); + Log(options_.info_log, "Number of Keys per prefix Histogram: %s", + keys_per_prefix_hist.ToString().c_str()); + + return num_prefixes; +} + +void PlainTableReader::Allocate(int num_prefixes) { if (hash_table_ != nullptr) { delete[] hash_table_; } - std::vector filter_entries(0); // for creating bloom filter; - if (filter_policy_ != nullptr) { - filter_entries.reserve(prefix_index_pairs.size()); + if (bloom_bits_per_key_ > 0) { + bloom_ = new DynamicBloom(num_prefixes * bloom_bits_per_key_); } double hash_table_size_multipier = (hash_table_ratio_ > 1.0) ? 
1.0 : 1.0 / hash_table_ratio_; - hash_table_size_ = prefix_index_pairs.size() * hash_table_size_multipier + 1; + hash_table_size_ = num_prefixes * hash_table_size_multipier + 1; hash_table_ = new uint32_t[hash_table_size_]; - std::vector hash2map(hash_table_size_); +} +size_t PlainTableReader::BucketizeIndexesAndFillBloom( + IndexRecordList& record_list, int num_prefixes, + std::vector& hash2offsets, + std::vector& bucket_count) { size_t sub_index_size_needed = 0; - for (auto& p: prefix_index_pairs) { - auto& sub_index = hash2map[getBucketId(p.first, key_prefix_len_, - hash_table_size_)]; - if (sub_index.length() > 0 || p.second.length() > kOffsetLen) { - if (sub_index.length() <= kOffsetLen) { - sub_index_size_needed += sub_index.length() + 4; + bool first = true; + uint32_t prev_hash = 0; + size_t num_records = record_list.GetNumRecords(); + for (size_t i = 0; i < num_records; i++) { + IndexRecord* index_record = record_list.At(i); + uint32_t cur_hash = index_record->hash; + if (first || prev_hash != cur_hash) { + prev_hash = cur_hash; + first = false; + if (bloom_) { + bloom_->AddHash(cur_hash); } - sub_index_size_needed += p.second.length(); } - sub_index.append(p.second); - if (filter_policy_ != nullptr) { - filter_entries.push_back(p.first); + uint32_t bucket = getBucketIdFromHash(cur_hash, hash_table_size_); + IndexRecord* prev_bucket_head = hash2offsets[bucket]; + index_record->next = prev_bucket_head; + hash2offsets[bucket] = index_record; + if (bucket_count[bucket] > 0) { + if (bucket_count[bucket] == 1) { + sub_index_size_needed += kOffsetLen + 1; + } + if (bucket_count[bucket] == 127) { + // Need more than one byte for length + sub_index_size_needed++; + } + sub_index_size_needed += kOffsetLen; } + bucket_count[bucket]++; } + return sub_index_size_needed; +} - sub_index_.clear(); +void PlainTableReader::FillIndexes(size_t sub_index_size_needed, + std::vector& hash2offsets, + std::vector& bucket_count) { Log(options_.info_log, "Reserving %zu bytes for sub index", sub_index_size_needed); - sub_index_.reserve(sub_index_size_needed); + // 4 bytes buffer for variable length size + size_t buffer_size = 64; + size_t buffer_used = 0; + sub_index_size_needed += buffer_size; + sub_index_ = new char[sub_index_size_needed]; + size_t sub_index_offset = 0; + char* prev_ptr; + char* cur_ptr; + uint32_t* sub_index_ptr; + IndexRecord* record; for (int i = 0; i < hash_table_size_; i++) { - uint32_t num_keys_for_bucket = hash2map[i].length() / kOffsetLen; + uint32_t num_keys_for_bucket = bucket_count[i]; switch (num_keys_for_bucket) { case 0: // No key for bucket @@ -229,58 +296,131 @@ Status PlainTableReader::PopulateIndex() { break; case 1: // point directly to the file offset - hash_table_[i] = DecodeFixed32(hash2map[i].data()); + hash_table_[i] = hash2offsets[i]->offset; break; default: - // point to index block - hash_table_[i] = sub_index_.length() | kSubIndexMask; - PutFixed32(&sub_index_, num_keys_for_bucket); - sub_index_.append(hash2map[i]); + // point to second level indexes. + hash_table_[i] = sub_index_offset | kSubIndexMask; + prev_ptr = sub_index_ + sub_index_offset; + cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket); + sub_index_offset += cur_ptr - prev_ptr; + if (cur_ptr - prev_ptr > 2 + || (cur_ptr - prev_ptr == 2 && num_keys_for_bucket <= 127)) { + // Need to resize sub_index. Exponentially grow buffer. 
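          // (A note on the doubling strategy: growing the buffer
          // geometrically keeps the total bytes copied across all resizes
          // linear in the final size, i.e. O(1) amortized per appended
          // byte, by the standard dynamic-array argument.)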
+ buffer_used += cur_ptr - prev_ptr - 1; + if (buffer_used + 4 > buffer_size) { + Log(options_.info_log, "Recalculate suffix_map length to %zu", + sub_index_size_needed); + + sub_index_size_needed += buffer_size; + buffer_size *= 2; + char* new_sub_index = new char[sub_index_size_needed]; + memcpy(new_sub_index, sub_index_, sub_index_offset); + delete[] sub_index_; + sub_index_ = new_sub_index; + } + } + sub_index_ptr = (uint32_t*) (sub_index_ + sub_index_offset); + record = hash2offsets[i]; + int j; + for (j = num_keys_for_bucket - 1; + j >= 0 && record; j--, record = record->next) { + sub_index_ptr[j] = record->offset; + } + assert(j == -1 && record == nullptr); + sub_index_offset += kOffsetLen * num_keys_for_bucket; + break; } } - if (filter_policy_ != nullptr) { - filter_str_.clear(); - filter_policy_->CreateFilter(&filter_entries[0], filter_entries.size(), - &filter_str_); - filter_slice_ = Slice(filter_str_.data(), filter_str_.size()); - } Log(options_.info_log, "hash table size: %d, suffix_map length %zu", - hash_table_size_, sub_index_.length()); - Log(options_.info_log, "Number of Keys per prefix Histogram: %s", - keys_per_prefix_hist.ToString().c_str()); + hash_table_size_, sub_index_size_needed); +} + +// PopulateIndex() builds index of keys. +// hash_table_ contains buckets size of hash_table_size_, each is a 32-bit +// integer. The lower 31 bits contain an offset value (explained below) and +// the first bit of the integer indicates type of the offset: +// +// 0 indicates that the bucket contains only one prefix (no conflict when +// hashing this prefix), whose first row starts from this offset of the file. +// 1 indicates that the bucket contains more than one prefixes, or there +// are too many rows for one prefix so we need a binary search for it. In +// this case, the offset indicates the offset of sub_index_ holding the +// binary search indexes of keys for those rows. Those binary search indexes +// are organized in this way: +// +// The first 4 bytes, indicates how many indexes (N) are stored after it. After +// it, there are N 32-bit integers, each points of an offset of the file, which +// points to starting of a row. Those offsets need to be guaranteed to be in +// ascending order so the keys they are pointing to are also in ascending order +// to make sure we can use them to do binary searches. +Status PlainTableReader::PopulateIndex() { + // Get mmapped memory to file_data_. + Status s = file_->Read(0, file_size_, &file_data_, nullptr); + if (!s.ok()) { + return s; + } + + IndexRecordList record_list(256); + // First, read the whole file, for every 16 rows for a prefix (starting from + // the first one), generate a record of (hash, offset) and append it to + // IndexRecordList, which is a data structure created to store them. + int num_prefixes = PopulateIndexRecordList(record_list); + // Calculated hash table and bloom filter size and allocate memory for indexes + // and bloom filter based on the number of prefixes. + Allocate(num_prefixes); + + // Bucketize all the index records to a temp data structure, in which for + // each bucket, we generate a linked list of IndexRecord, in reversed order. + std::vector hash2offsets(hash_table_size_, nullptr); + std::vector bucket_count(hash_table_size_, 0); + size_t sub_index_size_needed = BucketizeIndexesAndFillBloom(record_list, + num_prefixes, + hash2offsets, + bucket_count); + // From the temp data structure, populate indexes. 
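  // (Rough sizing, under assumed workload numbers: each IndexRecord holds
  // a 4-byte hash, a 4-byte offset and a next pointer, 16 bytes on a
  // 64-bit build. A file of 1M rows spread over 100K prefixes averages 10
  // rows per prefix, so the scan emits about one record per prefix:
  // roughly 100K records, or ~1.6MB of temporary memory that is freed
  // when PopulateIndex() returns and record_list goes out of scope.)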
+ FillIndexes(sub_index_size_needed, hash2offsets, bucket_count); return Status::OK(); } -uint32_t PlainTableReader::GetOffset(const Slice& target, - bool& prefix_matched) { +Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, + uint32_t prefix_hash, bool& prefix_matched, + uint32_t& ret_offset) { prefix_matched = false; - int bucket = getBucketId(target, key_prefix_len_, hash_table_size_); + int bucket = getBucketIdFromHash(prefix_hash, hash_table_size_); uint32_t bucket_value = hash_table_[bucket]; if (bucket_value == data_end_offset_) { - return data_end_offset_; + ret_offset = data_end_offset_; + return Status::OK(); } else if ((bucket_value & kSubIndexMask) == 0) { // point directly to the file - return bucket_value; + ret_offset = bucket_value; + return Status::OK(); } - // point to sub-index, need to do a binary search + // point to sub-index, need to do a binary search uint32_t low = 0; uint64_t prefix_index_offset = bucket_value ^ kSubIndexMask; - uint32_t upper_bound = DecodeFixed32(sub_index_.data() + prefix_index_offset); + + const char* index_ptr = sub_index_ + prefix_index_offset; + uint32_t upper_bound; + const uint32_t* base_ptr = (const uint32_t*) GetVarint32Ptr(index_ptr, + index_ptr + 4, + &upper_bound); uint32_t high = upper_bound; - uint64_t base_offset = prefix_index_offset + 4; Slice mid_key; // The key is between [low, high). Do a binary search between it. while (high - low > 1) { uint32_t mid = (high + low) / 2; - const char* index_offset = sub_index_.data() + base_offset - + kOffsetLen * mid; - uint32_t file_offset = DecodeFixed32(index_offset); - mid_key = Slice(file_data_.data() + file_offset, GetInternalKeyLength()); - + uint32_t file_offset = base_ptr[mid]; + size_t tmp; + Status s = ReadKey(file_data_.data() + file_offset, &mid_key, tmp); + if (!s.ok()) { + return s; + } int cmp_result = options_.comparator->Compare(target, mid_key); if (cmp_result > 0) { low = mid; @@ -289,38 +429,61 @@ uint32_t PlainTableReader::GetOffset(const Slice& target, // Happen to have found the exact key or target is smaller than the // first key after base_offset. prefix_matched = true; - return file_offset; + ret_offset = file_offset; + return Status::OK(); } else { high = mid; } } } - - // The key is between low and low+1 (if exists). Both of them can have the - // correct prefix. Need to rule out at least one, to avoid to miss the - // correct one. - uint32_t low_key_offset = DecodeFixed32( - sub_index_.data() + base_offset + kOffsetLen * low); - if (low + 1 < upper_bound) { - if (Slice(file_data_.data() + low_key_offset, key_prefix_len_) - == Slice(target.data(), key_prefix_len_)) { - prefix_matched = true; - } else { - prefix_matched = false; - return DecodeFixed32( - sub_index_.data() + base_offset + kOffsetLen * (low + 1)); - } - } else { + // Both of the key at the position low or low+1 could share the same + // prefix as target. We need to rule out one of them to avoid to go + // to the wrong prefix. + Slice low_key; + size_t tmp; + uint32_t low_key_offset = base_ptr[low]; + Status s = ReadKey(file_data_.data() + low_key_offset, &low_key, tmp); + if (GetPrefix(low_key) == prefix) { + prefix_matched = true; + ret_offset = low_key_offset; + } else if (low + 1 < upper_bound) { + // There is possible a next prefix, return it prefix_matched = false; + ret_offset = base_ptr[low + 1]; + } else { + // target is larger than a key of the last prefix in this bucket + // but with a different prefix. Key does not exist. 
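    // (Worked example with a 2-byte prefix: suppose this bucket's sorted
    // index entries point at keys "aa5" and "ab2", and the target is
    // "ab0". The binary search leaves low at "aa5", whose prefix "aa"
    // differs from the target's "ab", so base_ptr[low + 1], the first
    // "ab" row, is returned with prefix_matched left false and the caller
    // re-verifies the prefix of the first key it reads. If instead the
    // target sorts past every entry while sharing no prefix with the last
    // one, we fall through to the branch below and report
    // data_end_offset_: the key cannot exist.)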
+ ret_offset = data_end_offset_; } - return low_key_offset; + return Status::OK(); } -bool PlainTableReader::MayHavePrefix(const Slice& target_prefix) { - return filter_policy_ == nullptr - || filter_policy_->KeyMayMatch(target_prefix, filter_slice_); +bool PlainTableReader::MayHavePrefix(uint32_t hash) { + return bloom_ == nullptr || bloom_->MayContainHash(hash); } +Status PlainTableReader::ReadKey(const char* row_ptr, Slice* key, + size_t& bytes_read) { + const char* key_ptr; + bytes_read = 0; + size_t internal_key_size; + if (IsFixedLength()) { + internal_key_size = GetFixedInternalKeyLength(); + key_ptr = row_ptr; + } else { + uint32_t key_size; + key_ptr = GetVarint32Ptr(row_ptr, file_data_.data() + data_end_offset_, + &key_size); + internal_key_size = (size_t) key_size; + bytes_read = key_ptr - row_ptr; + } + if (row_ptr + internal_key_size >= file_data_.data() + data_end_offset_) { + return Status::Corruption("Unable to read the next key"); + } + *key = Slice(key_ptr, internal_key_size); + bytes_read += internal_key_size; + return Status::OK(); +} Status PlainTableReader::Next(uint32_t offset, Slice* key, Slice* value, uint32_t& next_offset) { @@ -333,22 +496,17 @@ Status PlainTableReader::Next(uint32_t offset, Slice* key, Slice* value, return Status::Corruption("Offset is out of file size"); } - int internal_key_size = GetInternalKeyLength(); - if (offset + internal_key_size >= data_end_offset_) { - return Status::Corruption("Un able to read the next key"); - } - - const char* key_ptr = file_data_.data() + offset; - *key = Slice(key_ptr, internal_key_size); - + const char* row_ptr = file_data_.data() + offset; + size_t bytes_for_key; + Status s = ReadKey(row_ptr, key, bytes_for_key); uint32_t value_size; - const char* value_ptr = GetVarint32Ptr(key_ptr + internal_key_size, + const char* value_ptr = GetVarint32Ptr(row_ptr + bytes_for_key, file_data_.data() + data_end_offset_, &value_size); if (value_ptr == nullptr) { return Status::Corruption("Error reading value length."); } - next_offset = offset + (value_ptr - key_ptr) + value_size; + next_offset = offset + (value_ptr - row_ptr) + value_size; if (next_offset > data_end_offset_) { return Status::Corruption("Reach end of file when reading value"); } @@ -362,13 +520,17 @@ Status PlainTableReader::Get( bool (*saver)(void*, const Slice&, const Slice&, bool), void (*mark_key_may_exist)(void*)) { // Check bloom filter first. - if (!MayHavePrefix(Slice(target.data(), key_prefix_len_))) { + Slice prefix_slice = GetPrefix(target); + uint32_t prefix_hash = GetSliceHash(prefix_slice); + if (!MayHavePrefix(prefix_hash)) { return Status::OK(); } - uint32_t offset; bool prefix_match; - offset = GetOffset(target, prefix_match); + Status s = GetOffset(target, prefix_slice, prefix_hash, prefix_match, offset); + if (!s.ok()) { + return s; + } Slice found_key; Slice found_value; while (offset < data_end_offset_) { @@ -379,8 +541,8 @@ Status PlainTableReader::Get( if (!prefix_match) { // Need to verify prefix for the first key found if it is not yet // checked. 
- if (!target.starts_with(Slice(found_key.data(), key_prefix_len_))) { - break; + if (GetPrefix(found_key) != prefix_slice) { + return Status::OK(); } prefix_match = true; } @@ -403,7 +565,7 @@ uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& key) { PlainTableIterator::PlainTableIterator(PlainTableReader* table) : table_(table) { - SeekToFirst(); + next_offset_ = offset_ = table_->data_end_offset_; } PlainTableIterator::~PlainTableIterator() { @@ -416,7 +578,11 @@ bool PlainTableIterator::Valid() const { void PlainTableIterator::SeekToFirst() { next_offset_ = table_->data_start_offset_; - Next(); + if (next_offset_ >= table_->data_end_offset_) { + next_offset_ = offset_ = table_->data_end_offset_; + } else { + Next(); + } } void PlainTableIterator::SeekToLast() { @@ -424,18 +590,25 @@ void PlainTableIterator::SeekToLast() { } void PlainTableIterator::Seek(const Slice& target) { - if (!table_->MayHavePrefix(Slice(target.data(), table_->key_prefix_len_))) { + Slice prefix_slice = table_->GetPrefix(target); + uint32_t prefix_hash = GetSliceHash(prefix_slice); + if (!table_->MayHavePrefix(prefix_hash)) { offset_ = next_offset_ = table_->data_end_offset_; return; } bool prefix_match; - next_offset_ = table_->GetOffset(target, prefix_match); + status_ = table_->GetOffset(target, prefix_slice, prefix_hash, prefix_match, + next_offset_); + if (!status_.ok()) { + offset_ = next_offset_ = table_->data_end_offset_; + return; + } if (next_offset_ < table_-> data_end_offset_) { for (Next(); status_.ok() && Valid(); Next()) { if (!prefix_match) { // Need to verify the first key's prefix - if (!target.starts_with(Slice(key().data(), table_->key_prefix_len_))) { + if (table_->GetPrefix(key()) != prefix_slice) { offset_ = next_offset_ = table_->data_end_offset_; break; } diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index 26a506d14..6d2efc7da 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -9,6 +9,7 @@ #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/table.h" +#include "rocksdb/plain_table_factory.h" namespace rocksdb { @@ -20,33 +21,12 @@ class RandomAccessFile; struct ReadOptions; class TableCache; class TableReader; +class DynamicBloom; using std::unique_ptr; using std::unordered_map; -// Based on following output file format: -// +-------------+ -// | version | -// +-------------+------------------------------+ <= key1_data_offset -// | key1 | value_size (4 bytes) | | -// +----------------------------------------+ | -// | value1 | -// | | -// +----------------------------------------+---+ <= key2_data_offset -// | key2 | value_size (4 bytes) | | -// +----------------------------------------+ | -// | value2 | -// | | -// | ...... | -// +-----------------+--------------------------+ <= index_block_offset -// | key1 | key1 offset (8 bytes) | -// +-----------------+--------------------------+ <= key2_index_offset -// | key2 | key2 offset (8 bytes) | -// +-----------------+--------------------------+ <= key3_index_offset -// | key3 | key3 offset (8 bytes) | -// +-----------------+--------------------------+ <= key4_index_offset -// | ...... | -// +-----------------+------------+-------------+ +// Based on following output file format shown in plain_table_factory.h // When opening the output file, IndexedTableReader creates a hash table // from key prefixes to offset of the output file. 
IndexedTable will decide // whether it points to the data offset of the first key with the key prefix @@ -58,8 +38,7 @@ class PlainTableReader: public TableReader { public: static Status Open(const Options& options, const EnvOptions& soptions, unique_ptr && file, uint64_t file_size, - unique_ptr* table, const int user_key_size, - const int key_prefix_len, const int bloom_num_bits, + unique_ptr* table, const int bloom_num_bits, double hash_table_ratio); bool PrefixMayMatch(const Slice& internal_prefix); @@ -81,20 +60,18 @@ public: return table_properties_; } - PlainTableReader( - const EnvOptions& storage_options, - uint64_t file_size, - int user_key_size, - int key_prefix_len, - int bloom_num_bits, - double hash_table_ratio, - const TableProperties& table_properties); + PlainTableReader(const EnvOptions& storage_options, uint64_t file_size, + int bloom_num_bits, double hash_table_ratio, + const TableProperties& table_properties); ~PlainTableReader(); private: + struct IndexRecord; + class IndexRecordList; + uint32_t* hash_table_ = nullptr; int hash_table_size_; - std::string sub_index_; + char* sub_index_ = nullptr; Options options_; const EnvOptions& soptions_; @@ -104,37 +81,67 @@ private: Slice file_data_; uint32_t version_; uint32_t file_size_; - const size_t user_key_size_; - const size_t key_prefix_len_; + const double hash_table_ratio_; - const FilterPolicy* filter_policy_; - std::string filter_str_; - Slice filter_slice_; + const int bloom_bits_per_key_; + DynamicBloom* bloom_; TableProperties table_properties_; - uint32_t data_start_offset_; - uint32_t data_end_offset_; + const uint32_t data_start_offset_; + const uint32_t data_end_offset_; + const size_t user_key_len_; static const size_t kNumInternalBytes = 8; static const uint32_t kSubIndexMask = 0x80000000; static const size_t kOffsetLen = sizeof(uint32_t); - inline size_t GetInternalKeyLength() { - return user_key_size_ + kNumInternalBytes; + bool IsFixedLength() { + return user_key_len_ != PlainTableFactory::kVariableLength; + } + + size_t GetFixedInternalKeyLength() { + return user_key_len_ + kNumInternalBytes; } friend class TableCache; friend class PlainTableIterator; + // Internal helper function to generate an IndexRecordList object from all + // the rows, which contains index records as a list. + int PopulateIndexRecordList(IndexRecordList& record_list); + + // Internal helper function to allocate memory for indexes and bloom filters + void Allocate(int num_prefixes); + + // Internal helper function to bucket index record list to hash buckets. + // hash2offsets is sized of of hash_table_size_, each contains a linked list + // of offsets for the hash, in reversed order. + // bucket_count is sized of hash_table_size_. The value is how many index + // records are there in hash2offsets for the same bucket. + size_t BucketizeIndexesAndFillBloom( + IndexRecordList& record_list, int num_prefixes, + std::vector& hash2offsets, + std::vector& bucket_count); + + // Internal helper class to fill the indexes and bloom filters to internal + // data structures. hash2offsets and bucket_count are bucketized indexes and + // counts generated by BucketizeIndexesAndFillBloom(). + void FillIndexes(size_t sub_index_size_needed, + std::vector& hash2offsets, + std::vector& bucket_count); + // Populate the internal indexes. It must be called before // any query to the table. // This query will populate the hash table hash_table_, the second // level of indexes sub_index_ and bloom filter filter_slice_ if enabled. 
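  // For reference, the read path that consults these structures, pieced
  // together from plain_table_reader.cc above (simplified pseudo-flow):
  //
  //   Slice prefix = GetPrefix(target);         // via prefix_extractor
  //   uint32_t hash = GetSliceHash(prefix);     // Hash(data, size, 397)
  //   if (!MayHavePrefix(hash)) return;         // bloom filter shortcut
  //   GetOffset(target, prefix, hash, ...);     // hash bucket, then
  //                                             // binary search in
  //                                             // sub_index_
  //   // then scan rows forward with Next() until the prefix changes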
Status PopulateIndex(); - // Check bloom filter to see whether it might contain this prefix - bool MayHavePrefix(const Slice& target_prefix); + // Check bloom filter to see whether it might contain this prefix. + // The hash of the prefix is given, since it can be reused for index lookup + // too. + bool MayHavePrefix(uint32_t hash); + Status ReadKey(const char* row_ptr, Slice* key, size_t& bytes_read); // Read the key and value at offset to key and value. // tmp_slice is a tmp slice. // return next_offset as the offset for the next key. @@ -142,7 +149,15 @@ private: // Get file offset for key target. // return value prefix_matched is set to true if the offset is confirmed // for a key with the same prefix as target. - uint32_t GetOffset(const Slice& target, bool& prefix_matched); + Status GetOffset(const Slice& target, const Slice& prefix, + uint32_t prefix_hash, bool& prefix_matched, + uint32_t& ret_offset); + + Slice GetPrefix(const Slice& target) { + assert(target.size() >= 8); // target is internal key + return options_.prefix_extractor->Transform( + Slice(target.data(), target.size() - 8)); + } // No copying allowed explicit PlainTableReader(const TableReader&) = delete; diff --git a/table/table_properties.cc b/table/table_properties.cc index 47e7f8b33..414b15681 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -104,6 +104,10 @@ const std::string TablePropertiesNames::kNumEntries = "rocksdb.num.entries"; const std::string TablePropertiesNames::kFilterPolicy = "rocksdb.filter.policy"; +const std::string TablePropertiesNames::kFormatVersion = + "rocksdb.format.version"; +const std::string TablePropertiesNames::kFixedKeyLen = + "rocksdb.fixed.key.length"; extern const std::string kPropertiesBlock = "rocksdb.properties"; diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index 7e7e6b7da..a491d168f 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -8,7 +8,6 @@ #include "rocksdb/db.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" -#include "rocksdb/slice_transform.h" #include "db/db_impl.h" #include "db/dbformat.h" #include "port/atomic_pointer.h" @@ -242,9 +241,10 @@ int main(int argc, char** argv) { if (FLAGS_plain_table) { options.allow_mmap_reads = true; env_options.use_mmap_reads = true; - tf = new rocksdb::PlainTableFactory(16, FLAGS_prefix_len, - (FLAGS_prefix_len == 16) ? 0 : 8, + tf = new rocksdb::PlainTableFactory(16, (FLAGS_prefix_len == 16) ? 
0 : 8, 0.75); + options.prefix_extractor = rocksdb::NewFixedPrefixTransform( + FLAGS_prefix_len); } else { tf = new rocksdb::BlockBasedTableFactory(); } diff --git a/table/table_test.cc b/table/table_test.cc index 7711ed8ad..bff8ee529 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -22,8 +22,8 @@ #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/memtablerep.h" - #include "table/meta_blocks.h" +#include "rocksdb/plain_table_factory.h" #include "table/block_based_table_builder.h" #include "table/block_based_table_factory.h" #include "table/block_based_table_reader.h" @@ -124,8 +124,9 @@ class StringSink: public WritableFile { class StringSource: public RandomAccessFile { public: - StringSource(const Slice& contents, uint64_t uniq_id) - : contents_(contents.data(), contents.size()), uniq_id_(uniq_id) { + StringSource(const Slice& contents, uint64_t uniq_id, bool mmap) + : contents_(contents.data(), contents.size()), uniq_id_(uniq_id), + mmap_(mmap) { } virtual ~StringSource() { } @@ -140,8 +141,12 @@ class StringSource: public RandomAccessFile { if (offset + n > contents_.size()) { n = contents_.size() - offset; } - memcpy(scratch, &contents_[offset], n); - *result = Slice(scratch, n); + if (!mmap_) { + memcpy(scratch, &contents_[offset], n); + *result = Slice(scratch, n); + } else { + *result = Slice(&contents_[offset], n); + } return Status::OK(); } @@ -159,6 +164,7 @@ class StringSource: public RandomAccessFile { private: std::string contents_; uint64_t uniq_id_; + bool mmap_; }; typedef std::map KVMap; @@ -245,42 +251,88 @@ class BlockConstructor: public Constructor { BlockConstructor(); }; -class BlockBasedTableConstructor: public Constructor { +// A helper class that converts internal format keys into user keys +class KeyConvertingIterator: public Iterator { public: - explicit BlockBasedTableConstructor( - const Comparator* cmp) - : Constructor(cmp) { + explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { } + virtual ~KeyConvertingIterator() { delete iter_; } + virtual bool Valid() const { return iter_->Valid(); } + virtual void Seek(const Slice& target) { + ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + iter_->Seek(encoded); + } + virtual void SeekToFirst() { iter_->SeekToFirst(); } + virtual void SeekToLast() { iter_->SeekToLast(); } + virtual void Next() { iter_->Next(); } + virtual void Prev() { iter_->Prev(); } + + virtual Slice key() const { + assert(Valid()); + ParsedInternalKey key; + if (!ParseInternalKey(iter_->key(), &key)) { + status_ = Status::Corruption("malformed internal key"); + return Slice("corrupted key"); + } + return key.user_key; } - ~BlockBasedTableConstructor() { + + virtual Slice value() const { return iter_->value(); } + virtual Status status() const { + return status_.ok() ? 
iter_->status() : status_; + } + + private: + mutable Status status_; + Iterator* iter_; + + // No copying allowed + KeyConvertingIterator(const KeyConvertingIterator&); + void operator=(const KeyConvertingIterator&); +}; + +class TableConstructor: public Constructor { + public: + explicit TableConstructor( + const Comparator* cmp, bool convert_to_internal_key = false) + : Constructor(cmp), + convert_to_internal_key_(convert_to_internal_key) { + } + ~TableConstructor() { Reset(); } virtual Status FinishImpl(const Options& options, const KVMap& data) { Reset(); sink_.reset(new StringSink()); - std::unique_ptr flush_policy_factory( - new FlushBlockBySizePolicyFactory(options.block_size, - options.block_size_deviation)); - - BlockBasedTableBuilder builder( - options, - sink_.get(), - flush_policy_factory.get(), - options.compression); + unique_ptr builder; + builder.reset( + options.table_factory->GetTableBuilder(options, sink_.get(), + options.compression)); for (KVMap::const_iterator it = data.begin(); it != data.end(); ++it) { - builder.Add(it->first, it->second); - ASSERT_TRUE(builder.status().ok()); + if (convert_to_internal_key_) { + ParsedInternalKey ikey(it->first, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + builder->Add(encoded, it->second); + } else { + builder->Add(it->first, it->second); + } + ASSERT_TRUE(builder->status().ok()); } - Status s = builder.Finish(); + Status s = builder->Finish(); ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(sink_->contents().size(), builder.FileSize()); + ASSERT_EQ(sink_->contents().size(), builder->FileSize()); // Open the table uniq_id_ = cur_uniq_id_++; - source_.reset(new StringSource(sink_->contents(), uniq_id_)); + source_.reset( + new StringSource(sink_->contents(), uniq_id_, + options.allow_mmap_reads)); unique_ptr table_factory; return options.table_factory->GetTableReader(options, soptions, std::move(source_), @@ -289,7 +341,12 @@ class BlockBasedTableConstructor: public Constructor { } virtual Iterator* NewIterator() const { - return table_reader_->NewIterator(ReadOptions()); + Iterator* iter = table_reader_->NewIterator(ReadOptions()); + if (convert_to_internal_key_) { + return new KeyConvertingIterator(iter); + } else { + return iter; + } } uint64_t ApproximateOffsetOf(const Slice& key) const { @@ -297,7 +354,9 @@ class BlockBasedTableConstructor: public Constructor { } virtual Status Reopen(const Options& options) { - source_.reset(new StringSource(sink_->contents(), uniq_id_)); + source_.reset( + new StringSource(sink_->contents(), uniq_id_, + options.allow_mmap_reads)); return options.table_factory->GetTableReader(options, soptions, std::move(source_), sink_->contents().size(), @@ -315,59 +374,19 @@ class BlockBasedTableConstructor: public Constructor { sink_.reset(); source_.reset(); } + bool convert_to_internal_key_; uint64_t uniq_id_; unique_ptr sink_; unique_ptr source_; unique_ptr table_reader_; - BlockBasedTableConstructor(); + TableConstructor(); static uint64_t cur_uniq_id_; const EnvOptions soptions; }; -uint64_t BlockBasedTableConstructor::cur_uniq_id_ = 1; - -// A helper class that converts internal format keys into user keys -class KeyConvertingIterator: public Iterator { - public: - explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { } - virtual ~KeyConvertingIterator() { delete iter_; } - virtual bool Valid() const { return iter_->Valid(); } - virtual void Seek(const Slice& target) { - ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); - 
std::string encoded; - AppendInternalKey(&encoded, ikey); - iter_->Seek(encoded); - } - virtual void SeekToFirst() { iter_->SeekToFirst(); } - virtual void SeekToLast() { iter_->SeekToLast(); } - virtual void Next() { iter_->Next(); } - virtual void Prev() { iter_->Prev(); } - - virtual Slice key() const { - assert(Valid()); - ParsedInternalKey key; - if (!ParseInternalKey(iter_->key(), &key)) { - status_ = Status::Corruption("malformed internal key"); - return Slice("corrupted key"); - } - return key.user_key; - } - - virtual Slice value() const { return iter_->value(); } - virtual Status status() const { - return status_.ok() ? iter_->status() : status_; - } - - private: - mutable Status status_; - Iterator* iter_; - - // No copying allowed - KeyConvertingIterator(const KeyConvertingIterator&); - void operator=(const KeyConvertingIterator&); -}; +uint64_t TableConstructor::cur_uniq_id_ = 1; class MemTableConstructor: public Constructor { public: @@ -481,7 +500,9 @@ static bool BZip2CompressionSupported() { #endif enum TestType { - TABLE_TEST, + BLOCK_BASED_TABLE_TEST, + PLAIN_TABLE_SEMI_FIXED_PREFIX, + PLAIN_TABLE_FULL_STR_PREFIX, BLOCK_TEST, MEMTABLE_TEST, DB_TEST @@ -497,8 +518,10 @@ struct TestArgs { static std::vector GenerateArgList() { std::vector ret; - TestType test_type[4] = {TABLE_TEST, BLOCK_TEST, MEMTABLE_TEST, DB_TEST}; - int test_type_len = 4; + TestType test_type[6] = { BLOCK_BASED_TABLE_TEST, + PLAIN_TABLE_SEMI_FIXED_PREFIX, PLAIN_TABLE_FULL_STR_PREFIX, BLOCK_TEST, + MEMTABLE_TEST, DB_TEST }; + int test_type_len = 6; bool reverse_compare[2] = {false, true}; int reverse_compare_len = 2; int restart_interval[3] = {16, 1, 1024}; @@ -523,20 +546,66 @@ static std::vector GenerateArgList() { #endif for(int i =0; i < test_type_len; i++) - for (int j =0; j < reverse_compare_len; j++) - for (int k =0; k < restart_interval_len; k++) - for (unsigned int n =0; n < compression_types.size(); n++) { - TestArgs one_arg; - one_arg.type = test_type[i]; - one_arg.reverse_compare = reverse_compare[j]; - one_arg.restart_interval = restart_interval[k]; - one_arg.compression = compression_types[n]; - ret.push_back(one_arg); - } + for (int j =0; j < reverse_compare_len; j++) { + if (test_type[i] == PLAIN_TABLE_SEMI_FIXED_PREFIX + || test_type[i] == PLAIN_TABLE_FULL_STR_PREFIX) { + // Plain table doesn't use restart index or compression. + TestArgs one_arg; + one_arg.type = test_type[i]; + one_arg.reverse_compare = reverse_compare[0]; + one_arg.restart_interval = restart_interval[0]; + one_arg.compression = compression_types[0]; + ret.push_back(one_arg); + continue; + } + for (int k = 0; k < restart_interval_len; k++) + for (unsigned int n = 0; n < compression_types.size(); n++) { + TestArgs one_arg; + one_arg.type = test_type[i]; + one_arg.reverse_compare = reverse_compare[j]; + one_arg.restart_interval = restart_interval[k]; + one_arg.compression = compression_types[n]; + ret.push_back(one_arg); + } + } return ret; } +// In order to make all tests run for plain table format, including +// those operating on empty keys, create a new prefix transformer which +// return fixed prefix if the slice is not shorter than the prefix length, +// and the full slice if it is shorter. 
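// For instance, with prefix_len = 2 the transform behaves as:
//
//   Transform("")    == ""     (shorter than the prefix, returned whole)
//   Transform("a")   == "a"
//   Transform("abc") == "ab"   (truncated to the fixed prefix)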
+class FixedOrLessPrefixTransform : public SliceTransform { + private: + const size_t prefix_len_; + + public: + explicit FixedOrLessPrefixTransform(size_t prefix_len) : + prefix_len_(prefix_len) { + } + + virtual const char* Name() const { + return "rocksdb.FixedPrefix"; + } + + virtual Slice Transform(const Slice& src) const { + assert(InDomain(src)); + if (src.size() < prefix_len_) { + return src; + } + return Slice(src.data(), prefix_len_); + } + + virtual bool InDomain(const Slice& src) const { + return true; + } + + virtual bool InRange(const Slice& dst) const { + return (dst.size() <= prefix_len_); + } +}; + class Harness { public: Harness() : constructor_(nullptr) { } @@ -554,9 +623,35 @@ class Harness { if (args.reverse_compare) { options_.comparator = &reverse_key_comparator; } + internal_comparator_.reset(new InternalKeyComparator(options_.comparator)); + support_prev_ = true; + only_support_prefix_seek_ = false; + BlockBasedTableFactory::TableOptions table_options; switch (args.type) { - case TABLE_TEST: - constructor_ = new BlockBasedTableConstructor(options_.comparator); + case BLOCK_BASED_TABLE_TEST: + table_options.flush_block_policy_factory.reset( + new FlushBlockBySizePolicyFactory(options_.block_size, + options_.block_size_deviation)); + options_.table_factory.reset(new BlockBasedTableFactory(table_options)); + constructor_ = new TableConstructor(options_.comparator); + break; + case PLAIN_TABLE_SEMI_FIXED_PREFIX: + support_prev_ = false; + only_support_prefix_seek_ = true; + options_.prefix_extractor = new FixedOrLessPrefixTransform(2); + options_.allow_mmap_reads = true; + options_.table_factory.reset(new PlainTableFactory()); + constructor_ = new TableConstructor(options_.comparator, true); + options_.comparator = internal_comparator_.get(); + break; + case PLAIN_TABLE_FULL_STR_PREFIX: + support_prev_ = false; + only_support_prefix_seek_ = true; + options_.prefix_extractor = NewNoopTransform(); + options_.allow_mmap_reads = true; + options_.table_factory.reset(new PlainTableFactory()); + constructor_ = new TableConstructor(options_.comparator, true); + options_.comparator = internal_comparator_.get(); break; case BLOCK_TEST: constructor_ = new BlockConstructor(options_.comparator); @@ -584,7 +679,9 @@ class Harness { constructor_->Finish(options_, &keys, &data); TestForwardScan(keys, data); - TestBackwardScan(keys, data); + if (support_prev_) { + TestBackwardScan(keys, data); + } TestRandomAccess(rnd, keys, data); } @@ -627,7 +724,7 @@ class Harness { KVMap::const_iterator model_iter = data.begin(); if (kVerbose) fprintf(stderr, "---\n"); for (int i = 0; i < 200; i++) { - const int toss = rnd->Uniform(5); + const int toss = rnd->Uniform(support_prev_ ? 5 : 3); switch (toss) { case 0: { if (iter->Valid()) { @@ -719,17 +816,20 @@ class Harness { } else { const int index = rnd->Uniform(keys.size()); std::string result = keys[index]; - switch (rnd->Uniform(3)) { + switch (rnd->Uniform(support_prev_ ? 
3 : 1)) { case 0: // Return an existing key break; case 1: { // Attempt to return something smaller than an existing key - if (result.size() > 0 && result[result.size()-1] > '\0') { - result[result.size()-1]--; + if (result.size() > 0 && result[result.size() - 1] > '\0' + && (!only_support_prefix_seek_ + || options_.prefix_extractor->Transform(result).size() + < result.size())) { + result[result.size() - 1]--; } break; - } + } case 2: { // Return something larger than an existing key Increment(options_.comparator, &result); @@ -746,6 +846,9 @@ class Harness { private: Options options_ = Options(); Constructor* constructor_; + bool support_prev_; + bool only_support_prefix_seek_; + shared_ptr internal_comparator_; }; static bool Between(uint64_t val, uint64_t low, uint64_t high) { @@ -763,8 +866,8 @@ class TableTest { }; // This test include all the basic checks except those for index size and block // size, which will be conducted in separated unit tests. -TEST(TableTest, BasicBlockedBasedTableProperties) { - BlockBasedTableConstructor c(BytewiseComparator()); +TEST(TableTest, BasicTableProperties) { + TableConstructor c(BytewiseComparator()); c.Add("a1", "val1"); c.Add("b2", "val2"); @@ -824,7 +927,7 @@ TEST(TableTest, BasicPlainTableProperties) { } ASSERT_OK(builder->Finish()); - StringSource source(sink.contents(), 72242); + StringSource source(sink.contents(), 72242, true); TableProperties props; auto s = ReadTableProperties( @@ -849,7 +952,7 @@ TEST(TableTest, BasicPlainTableProperties) { } TEST(TableTest, FilterPolicyNameProperties) { - BlockBasedTableConstructor c(BytewiseComparator()); + TableConstructor c(BytewiseComparator()); c.Add("a1", "val1"); std::vector keys; KVMap kvmap; @@ -889,7 +992,7 @@ TEST(TableTest, IndexSizeStat) { // Each time we load one more key to the table. the table index block // size is expected to be larger than last time's. 
for (size_t i = 1; i < keys.size(); ++i) { - BlockBasedTableConstructor c(BytewiseComparator()); + TableConstructor c(BytewiseComparator()); for (size_t j = 0; j < i; ++j) { c.Add(keys[j], "val"); } @@ -910,7 +1013,7 @@ TEST(TableTest, IndexSizeStat) { TEST(TableTest, NumBlockStat) { Random rnd(test::RandomSeed()); - BlockBasedTableConstructor c(BytewiseComparator()); + TableConstructor c(BytewiseComparator()); Options options; options.compression = kNoCompression; options.block_restart_interval = 1; @@ -986,7 +1089,7 @@ TEST(TableTest, BlockCacheTest) { std::vector keys; KVMap kvmap; - BlockBasedTableConstructor c(BytewiseComparator()); + TableConstructor c(BytewiseComparator()); c.Add("key", "value"); c.Finish(options, &keys, &kvmap); @@ -1107,7 +1210,7 @@ TEST(TableTest, BlockCacheTest) { } TEST(TableTest, ApproximateOffsetOfPlain) { - BlockBasedTableConstructor c(BytewiseComparator()); + TableConstructor c(BytewiseComparator()); c.Add("k01", "hello"); c.Add("k02", "hello2"); c.Add("k03", std::string(10000, 'x')); @@ -1138,7 +1241,7 @@ TEST(TableTest, ApproximateOffsetOfPlain) { static void Do_Compression_Test(CompressionType comp) { Random rnd(301); - BlockBasedTableConstructor c(BytewiseComparator()); + TableConstructor c(BytewiseComparator()); std::string tmp; c.Add("k01", "hello"); c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); @@ -1156,7 +1259,7 @@ static void Do_Compression_Test(CompressionType comp) { ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6100)); } TEST(TableTest, ApproximateOffsetOfCompressed) { @@ -1194,7 +1297,7 @@ TEST(TableTest, BlockCacheLeak) { opt.block_cache = NewLRUCache(16*1024*1024); // big enough so we don't ever // lose cached values. 
- BlockBasedTableConstructor c(BytewiseComparator()); + TableConstructor c(BytewiseComparator()); c.Add("k01", "hello"); c.Add("k02", "hello2"); c.Add("k03", std::string(10000, 'x')); diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc index be47ab55a..84f964d9e 100644 --- a/util/dynamic_bloom.cc +++ b/util/dynamic_bloom.cc @@ -39,7 +39,10 @@ DynamicBloom::DynamicBloom(uint32_t total_bits, } void DynamicBloom::Add(const Slice& key) { - uint32_t h = hash_func_(key); + AddHash(hash_func_(key)); +} + +void DynamicBloom::AddHash(uint32_t h) { const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits for (uint32_t i = 0; i < num_probes_; i++) { const uint32_t bitpos = h % total_bits_; @@ -49,7 +52,10 @@ void DynamicBloom::Add(const Slice& key) { } bool DynamicBloom::MayContain(const Slice& key) { - uint32_t h = hash_func_(key); + return (MayContainHash(hash_func_(key))); +} + +bool DynamicBloom::MayContainHash(uint32_t h) { const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits for (uint32_t i = 0; i < num_probes_; i++) { const uint32_t bitpos = h % total_bits_; diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h index c496e2ce7..aa29a4ae7 100644 --- a/util/dynamic_bloom.h +++ b/util/dynamic_bloom.h @@ -28,9 +28,14 @@ class DynamicBloom { // Assuming single threaded access to Add void Add(const Slice& key); + // Assuming single threaded access to Add + void AddHash(uint32_t hash); + // Multithreaded access to MayContain is OK bool MayContain(const Slice& key); + // Multithreaded access to MayContain is OK + bool MayContainHash(uint32_t hash); private: uint32_t (*hash_func_)(const Slice& key); From e94eea4527f2d7de82a6bf3303177977011e5dd9 Mon Sep 17 00:00:00 2001 From: Haobo Xu Date: Fri, 20 Dec 2013 14:18:28 -0800 Subject: [PATCH 40/70] [RocksDB] [Performance Branch] Minor fix, Remove string resize from WriteBatch::Clear Summary: tmp_batch_ will get re-allocated for every merged write batch because of the existing resize in WriteBatch::Clear. Note that in DBImpl::BuildBatchGroup, we have a hard coded upper limit of batch size 1<<20 = 1MB already. Test Plan: make check Reviewers: dhruba, sdong CC: leveldb Differential Revision: https://reviews.facebook.net/D14787 --- db/write_batch.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/db/write_batch.cc b/db/write_batch.cc index c04930bbf..fa01e82db 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -58,7 +58,6 @@ bool WriteBatch::Handler::Continue() { void WriteBatch::Clear() { rep_.clear(); - rep_.resize(kHeader); } int WriteBatch::Count() const { From bf4a48ccb356cf5ed205a30201e751218da7cfb0 Mon Sep 17 00:00:00 2001 From: Haobo Xu Date: Fri, 20 Dec 2013 18:14:17 -0800 Subject: [PATCH 41/70] [RocksDB] [Performance Branch] Revert previous patch. Summary: The previous patch is wrong. rep_.resize(kHeader) just resets the header portion to zero, and should not cause a re-allocation if g++ does it right. I will go ahead and revert it. 
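(Editor's note, not part of the original message: the capacity claim is easy to
check in isolation. Below is a minimal stand-alone sketch, assuming only the
standard library; the 12-byte kHeader mirrors WriteBatch's fixed header of an
8-byte sequence number plus a 4-byte count, and how clear() treats capacity is
implementation-defined, which is what the "if g++ does it right" caveat above
refers to.)

    #include <cassert>
    #include <string>

    int main() {
      const size_t kHeader = 12;             // 8-byte sequence + 4-byte count
      std::string rep(1 << 20, 'x');         // a fully grown batch buffer
      const char* before = rep.data();
      rep.clear();                           // length 0; allocation retained
      rep.resize(kHeader);                   // first 12 bytes become '\0'
      assert(rep.capacity() >= (1u << 20));  // no shrink with libstdc++
      assert(rep.data() == before);          // and no reallocation
      return 0;
    }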
Test Plan: make check Reviewers: dhruba, sdong CC: leveldb Differential Revision: https://reviews.facebook.net/D14793 --- db/write_batch.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/db/write_batch.cc b/db/write_batch.cc index fa01e82db..c04930bbf 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -58,6 +58,7 @@ bool WriteBatch::Handler::Continue() { void WriteBatch::Clear() { rep_.clear(); + rep_.resize(kHeader); } int WriteBatch::Count() const { From 476416c27c0cdd5791e33a69786857ad6a5e9789 Mon Sep 17 00:00:00 2001 From: kailiu Date: Mon, 30 Dec 2013 18:33:57 -0800 Subject: [PATCH 42/70] Some minor refactoring on the code Summary: I made some cleanup while reading the source code in `db`. Most changes are about style, naming or C++ 11 new features. Test Plan: ran `make check` Reviewers: haobo, dhruba, sdong CC: leveldb Differential Revision: https://reviews.facebook.net/D15009 --- db/db_impl.cc | 232 ++++++++++++++++++++++++--------------------- db/db_impl.h | 7 +- db/log_format.h | 1 - db/version_edit.cc | 8 +- db/version_edit.h | 10 +- db/version_set.cc | 100 +++++++++---------- db/version_set.h | 6 +- 7 files changed, 190 insertions(+), 174 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index d3effae86..d63f145b3 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -20,8 +20,8 @@ #include #include "db/builder.h" -#include "db/dbformat.h" #include "db/db_iter.h" +#include "db/dbformat.h" #include "db/filename.h" #include "db/log_reader.h" #include "db/log_writer.h" @@ -43,7 +43,6 @@ #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "port/port.h" #include "table/block.h" #include "table/block_based_table_factory.h" #include "table/merger.h" @@ -59,7 +58,7 @@ namespace rocksdb { -void dumpLeveldbBuildVersion(Logger * log); +void DumpLeveldbBuildVersion(Logger * log); // Information kept for every waiting writer struct DBImpl::Writer { @@ -266,9 +265,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) storage_options_(options), bg_work_gate_closed_(false), refitting_level_(false) { - mem_->Ref(); - env_->GetAbsolutePath(dbname, &db_absolute_path_); stall_leveln_slowdown_.resize(options.num_levels); @@ -282,16 +279,15 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) const int table_cache_size = options_.max_open_files - 10; table_cache_.reset(new TableCache(dbname_, &options_, storage_options_, table_cache_size)); - versions_.reset(new VersionSet(dbname_, &options_, storage_options_, table_cache_.get(), &internal_comparator_)); - dumpLeveldbBuildVersion(options_.info_log.get()); + DumpLeveldbBuildVersion(options_.info_log.get()); options_.Dump(options_.info_log.get()); char name[100]; - Status st = env_->GetHostName(name, 100L); - if (st.ok()) { + Status s = env_->GetHostName(name, 100L); + if (s.ok()) { host_name_ = name; } else { Log(options_.info_log, "Can't get hostname, use localhost as host name."); @@ -502,7 +498,7 @@ void DBImpl::SuperVersion::Init(MemTable* new_mem, const MemTableList& new_imm, } // Returns the list of live files in 'sst_live' and the list -// of all files in the filesystem in 'all_files'. +// of all files in the filesystem in 'candidate_files'. 
// no_full_scan = true -- never do the full scan using GetChildren()
// force = false -- don't force the full scan, except every
//  options_.delete_obsolete_files_period_micros
@@ -554,15 +550,18 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state,
   versions_->AddLiveFiles(&deletion_state.sst_live);

   if (doing_the_full_scan) {
-    // set of all files in the directory
-    env_->GetChildren(dbname_, &deletion_state.all_files); // Ignore errors
+    // set of all files in the directory. We'll exclude files that are still
+    // alive in the subsequent processing.
+    env_->GetChildren(
+        dbname_, &deletion_state.candidate_files
+    ); // Ignore errors

     // Add log files in wal_dir
     if (options_.wal_dir != dbname_) {
       std::vector<std::string> log_files;
       env_->GetChildren(options_.wal_dir, &log_files); // Ignore errors
-      deletion_state.all_files.insert(
-        deletion_state.all_files.end(),
+      deletion_state.candidate_files.insert(
+        deletion_state.candidate_files.end(),
         log_files.begin(),
         log_files.end()
       );
@@ -575,11 +574,10 @@
 // files in sst_delete_files and log_delete_files.
 // It is not necessary to hold the mutex when invoking this method.
 void DBImpl::PurgeObsoleteFiles(DeletionState& state) {
-  // check if there is anything to do
-  if (!state.all_files.size() &&
-      !state.sst_delete_files.size() &&
-      !state.log_delete_files.size()) {
+  if (state.candidate_files.empty() &&
+      state.sst_delete_files.empty() &&
+      state.log_delete_files.empty()) {
     return;
   }
@@ -589,100 +587,114 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) {
   if (state.manifest_file_number == 0) {
     return;
   }
-
-  uint64_t number;
-  FileType type;
   std::vector<std::string> old_log_files;

   // Now, convert live list to an unordered set, WITHOUT mutex held;
   // set is slow.
-  std::unordered_set<uint64_t> live_set(state.sst_live.begin(),
-                                        state.sst_live.end());
+  std::unordered_set<uint64_t> sst_live(
+      state.sst_live.begin(), state.sst_live.end()
+  );

-  state.all_files.reserve(state.all_files.size() +
-      state.sst_delete_files.size());
+  auto& candidate_files = state.candidate_files;
+  candidate_files.reserve(
+      candidate_files.size() +
+      state.sst_delete_files.size() +
+      state.log_delete_files.size());
+  // We may ignore the dbname when generating the file names.
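+  // (Editor's addition: with an empty db name the name helpers produce
+  //  strings like "/123456.sst", so the .substr(1) calls below strip the
+  //  leading '/' and leave bare file names, matching what GetChildren()
+  //  returned into candidate_files.)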
+ const char* kDumbDbName = ""; for (auto file : state.sst_delete_files) { - state.all_files.push_back(TableFileName("", file->number).substr(1)); + candidate_files.push_back( + TableFileName(kDumbDbName, file->number).substr(1) + ); delete file; } - state.all_files.reserve(state.all_files.size() + - state.log_delete_files.size()); - for (auto filenum : state.log_delete_files) { - if (filenum > 0) { - state.all_files.push_back(LogFileName("", filenum).substr(1)); + for (auto file_num : state.log_delete_files) { + if (file_num > 0) { + candidate_files.push_back( + LogFileName(kDumbDbName, file_num).substr(1) + ); } } - // dedup state.all_files so we don't try to delete the same + // dedup state.candidate_files so we don't try to delete the same // file twice - sort(state.all_files.begin(), state.all_files.end()); - auto unique_end = unique(state.all_files.begin(), state.all_files.end()); - - for (size_t i = 0; state.all_files.begin() + i < unique_end; i++) { - if (ParseFileName(state.all_files[i], &number, &type)) { - bool keep = true; - switch (type) { - case kLogFile: - keep = ((number >= state.log_number) || - (number == state.prev_log_number)); - break; - case kDescriptorFile: - // Keep my manifest file, and any newer incarnations' - // (in case there is a race that allows other incarnations) - keep = (number >= state.manifest_file_number); - break; - case kTableFile: - keep = (live_set.find(number) != live_set.end()); - break; - case kTempFile: - // Any temp files that are currently being written to must - // be recorded in pending_outputs_, which is inserted into "live" - keep = (live_set.find(number) != live_set.end()); - break; - case kInfoLogFile: - keep = true; - if (number != 0) { - old_log_files.push_back(state.all_files[i]); - } - break; - case kCurrentFile: - case kDBLockFile: - case kIdentityFile: - case kMetaDatabase: - keep = true; - break; - } + sort(candidate_files.begin(), candidate_files.end()); + candidate_files.erase( + unique(candidate_files.begin(), candidate_files.end()), + candidate_files.end() + ); - if (!keep) { - if (type == kTableFile) { - // evict from cache - table_cache_->Evict(number); + for (const auto& to_delete : candidate_files) { + uint64_t number; + FileType type; + // Ignore file if we cannot recognize it. + if (!ParseFileName(to_delete, &number, &type)) { + continue; + } + + bool keep = true; + switch (type) { + case kLogFile: + keep = ((number >= state.log_number) || + (number == state.prev_log_number)); + break; + case kDescriptorFile: + // Keep my manifest file, and any newer incarnations' + // (in case there is a race that allows other incarnations) + keep = (number >= state.manifest_file_number); + break; + case kTableFile: + keep = (sst_live.find(number) != sst_live.end()); + break; + case kTempFile: + // Any temp files that are currently being written to must + // be recorded in pending_outputs_, which is inserted into "live" + keep = (sst_live.find(number) != sst_live.end()); + break; + case kInfoLogFile: + keep = true; + if (number != 0) { + old_log_files.push_back(to_delete); } - std::string fname = ((type == kLogFile) ? options_.wal_dir : dbname_) + - "/" + state.all_files[i]; + break; + case kCurrentFile: + case kDBLockFile: + case kIdentityFile: + case kMetaDatabase: + keep = true; + break; + } + + if (keep) { + continue; + } + + if (type == kTableFile) { + // evict from cache + table_cache_->Evict(number); + } + std::string fname = ((type == kLogFile) ? 
options_.wal_dir : dbname_) + + "/" + to_delete; + Log(options_.info_log, + "Delete type=%d #%lu", + int(type), + (unsigned long)number); + + if (type == kLogFile && + (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0)) { + Status s = env_->RenameFile(fname, + ArchivedLogFileName(options_.wal_dir, number)); + if (!s.ok()) { Log(options_.info_log, - "Delete type=%d #%lu", - int(type), - (unsigned long)number); - - Status st; - if (type == kLogFile && (options_.WAL_ttl_seconds > 0 || - options_.WAL_size_limit_MB > 0)) { - st = env_->RenameFile(fname, - ArchivedLogFileName(options_.wal_dir, number)); - if (!st.ok()) { - Log(options_.info_log, - "RenameFile logfile #%lu FAILED -- %s\n", - (unsigned long)number, st.ToString().c_str()); - } - } else { - st = env_->DeleteFile(fname); - if (!st.ok()) { - Log(options_.info_log, "Delete type=%d #%lu FAILED -- %s\n", - int(type), (unsigned long)number, st.ToString().c_str()); - } - } + "RenameFile logfile #%lu FAILED -- %s\n", + (unsigned long)number, s.ToString().c_str()); + } + } else { + Status s = env_->DeleteFile(fname); + if (!s.ok()) { + Log(options_.info_log, "Delete type=%d #%lu FAILED -- %s\n", + int(type), (unsigned long)number, s.ToString().c_str()); } } } @@ -839,7 +851,9 @@ void DBImpl::PurgeObsoleteWALFiles() { // If externalTable is set, then apply recovered transactions // to that table. This is used for readonly mode. -Status DBImpl::Recover(VersionEdit* edit, MemTable* external_table, +Status DBImpl::Recover( + VersionEdit* edit, + MemTable* external_table, bool error_if_log_file_exist) { mutex_.AssertHeld(); @@ -906,10 +920,11 @@ Status DBImpl::Recover(VersionEdit* edit, MemTable* external_table, if (!s.ok()) { return s; } - uint64_t number; - FileType type; + std::vector logs; for (size_t i = 0; i < filenames.size(); i++) { + uint64_t number; + FileType type; if (ParseFileName(filenames[i], &number, &type) && type == kLogFile && ((number >= min_log) || (number == prev_log))) { @@ -925,12 +940,12 @@ Status DBImpl::Recover(VersionEdit* edit, MemTable* external_table, // Recover in the order in which the logs were generated std::sort(logs.begin(), logs.end()); - for (size_t i = 0; i < logs.size(); i++) { - s = RecoverLogFile(logs[i], edit, &max_sequence, external_table); + for (const auto& log : logs) { + s = RecoverLogFile(log, edit, &max_sequence, external_table); // The previous incarnation may not have written any MANIFEST // records after allocating this log number. So we manually // update the file number allocation counter in VersionSet. - versions_->MarkFileNumberUsed(logs[i]); + versions_->MarkFileNumberUsed(log); } if (s.ok()) { @@ -1147,7 +1162,6 @@ Status DBImpl::WriteLevel0Table(std::vector &mems, VersionEdit* edit, } base->Unref(); - // re-acquire the most current version base = versions_->current(); @@ -3285,7 +3299,7 @@ Status DBImpl::MakeRoomForWrite(bool force, } else { unique_ptr lfile; - MemTable* memtmp = nullptr; + MemTable* new_mem = nullptr; // Attempt to switch to a new memtable and trigger compaction of old. // Do this without holding the dbmutex lock. @@ -3306,7 +3320,7 @@ Status DBImpl::MakeRoomForWrite(bool force, // Our final size should be less than write_buffer_size // (compression, etc) but err on the side of caution. 
lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size); - memtmp = new MemTable( + new_mem = new MemTable( internal_comparator_, mem_rep_factory_, NumberLevels(), options_); new_superversion = new SuperVersion(options_.max_write_buffer_number); } @@ -3315,7 +3329,7 @@ Status DBImpl::MakeRoomForWrite(bool force, if (!s.ok()) { // Avoid chewing through file number space in a tight loop. versions_->ReuseFileNumber(new_log_number); - assert (!memtmp); + assert (!new_mem); break; } logfile_number_ = new_log_number; @@ -3325,7 +3339,7 @@ Status DBImpl::MakeRoomForWrite(bool force, if (force) { imm_.FlushRequested(); } - mem_ = memtmp; + mem_ = new_mem; mem_->Ref(); Log(options_.info_log, "New memtable created with log file: #%lu\n", @@ -3806,7 +3820,7 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { delete impl; return s; } - impl->mutex_.Lock(); + impl->mutex_.Lock(); // DBImpl::Recover() requires lock being held VersionEdit edit(impl->NumberLevels()); s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists if (s.ok()) { @@ -3929,7 +3943,7 @@ Status DestroyDB(const std::string& dbname, const Options& options) { // // A global method that can dump out the build version -void dumpLeveldbBuildVersion(Logger * log) { +void DumpLeveldbBuildVersion(Logger * log) { Log(log, "Git sha %s", rocksdb_build_git_sha); Log(log, "Compile time %s %s", rocksdb_build_compile_time, rocksdb_build_compile_date); diff --git a/db/db_impl.h b/db/db_impl.h index 5f2148c8b..a11106f91 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -11,6 +11,7 @@ #include #include #include + #include "db/dbformat.h" #include "db/log_writer.h" #include "db/snapshot.h" @@ -159,7 +160,7 @@ class DBImpl : public DB { // needed for CleanupIteratorState struct DeletionState { inline bool HaveSomethingToDelete() const { - return all_files.size() || + return candidate_files.size() || sst_delete_files.size() || log_delete_files.size(); } @@ -167,7 +168,7 @@ class DBImpl : public DB { // a list of all files that we'll consider deleting // (every once in a while this is filled up with all files // in the DB directory) - std::vector all_files; + std::vector candidate_files; // the list of all live sst files that cannot be deleted std::vector sst_live; @@ -214,7 +215,7 @@ class DBImpl : public DB { }; // Returns the list of live files in 'live' and the list - // of all files in the filesystem in 'all_files'. + // of all files in the filesystem in 'candidate_files'. 
// If force == false and the last call was less than // options_.delete_obsolete_files_period_micros microseconds ago, // it will not fill up the deletion_state diff --git a/db/log_format.h b/db/log_format.h index 10a31ba27..919c087e2 100644 --- a/db/log_format.h +++ b/db/log_format.h @@ -17,7 +17,6 @@ namespace log { enum RecordType { // Zero is reserved for preallocated files kZeroType = 0, - kFullType = 1, // For fragments diff --git a/db/version_edit.cc b/db/version_edit.cc index 9f23faba7..1a6a11956 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -74,12 +74,10 @@ void VersionEdit::EncodeTo(std::string* dst) const { PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode()); } - for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); - iter != deleted_files_.end(); - ++iter) { + for (const auto& deleted : deleted_files_) { PutVarint32(dst, kDeletedFile); - PutVarint32(dst, iter->first); // level - PutVarint64(dst, iter->second); // file number + PutVarint32(dst, deleted.first /* level */); + PutVarint64(dst, deleted.second /* file number */); } for (size_t i = 0; i < new_files_.size(); i++) { diff --git a/db/version_edit.h b/db/version_edit.h index 196914e2b..d6fac1c3c 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -75,6 +75,7 @@ class VersionEdit { const InternalKey& largest, const SequenceNumber& smallest_seqno, const SequenceNumber& largest_seqno) { + assert(smallest_seqno <= largest_seqno); FileMetaData f; f.number = file; f.file_size = file_size; @@ -82,13 +83,12 @@ class VersionEdit { f.largest = largest; f.smallest_seqno = smallest_seqno; f.largest_seqno = largest_seqno; - assert(smallest_seqno <= largest_seqno); new_files_.push_back(std::make_pair(level, f)); } // Delete the specified "file" from the specified "level". 
void DeleteFile(int level, uint64_t file) {
-    deleted_files_.insert(std::make_pair(level, file));
+    deleted_files_.insert({level, file});
   }

   // Number of edits
@@ -104,7 +104,7 @@
  private:
   friend class VersionSet;

-  typedef std::set< std::pair<int, uint64_t> > DeletedFileSet;
+  typedef std::set< std::pair<int, uint64_t>> DeletedFileSet;

   bool GetLevel(Slice* input, int* level, const char** msg);

@@ -120,9 +120,9 @@
   bool has_next_file_number_;
   bool has_last_sequence_;

-  std::vector< std::pair<int, InternalKey> > compact_pointers_;
+  std::vector<std::pair<int, InternalKey>> compact_pointers_;
   DeletedFileSet deleted_files_;
-  std::vector< std::pair<int, FileMetaData> > new_files_;
+  std::vector<std::pair<int, FileMetaData>> new_files_;
 };

 }  // namespace rocksdb
diff --git a/db/version_set.cc b/db/version_set.cc
index ee6c36c0f..e2421ef92 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -958,6 +958,7 @@ class VersionSet::Builder {
         }
       }
     }
+    delete[] levels_;
     base_->Unref();
   }

@@ -1043,19 +1044,17 @@
     // Delete files
     const VersionEdit::DeletedFileSet& del = edit->deleted_files_;
-    for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin();
-         iter != del.end();
-         ++iter) {
-      const int level = iter->first;
-      const uint64_t number = iter->second;
+    for (const auto& del_file : del) {
+      const auto level = del_file.first;
+      const auto number = del_file.second;
       levels_[level].deleted_files.insert(number);
       CheckConsistencyForDeletes(edit, number, level);
     }

     // Add new files
-    for (size_t i = 0; i < edit->new_files_.size(); i++) {
-      const int level = edit->new_files_[i].first;
-      FileMetaData* f = new FileMetaData(edit->new_files_[i].second);
+    for (const auto& new_file : edit->new_files_) {
+      const int level = new_file.first;
+      FileMetaData* f = new FileMetaData(new_file.second);
       f->refs = 1;

       // We arrange to automatically compact this file after
@@ -1088,23 +1087,21 @@
     for (int level = 0; level < vset_->NumberLevels(); level++) {
       // Merge the set of added files with the set of pre-existing files.
       // Drop any deleted files. Store the result in *v.
- const std::vector& base_files = base_->files_[level]; - std::vector::const_iterator base_iter = base_files.begin(); - std::vector::const_iterator base_end = base_files.end(); - const FileSet* added = levels_[level].added_files; - v->files_[level].reserve(base_files.size() + added->size()); - for (FileSet::const_iterator added_iter = added->begin(); - added_iter != added->end(); - ++added_iter) { + const auto& base_files = base_->files_[level]; + auto base_iter = base_files.begin(); + auto base_end = base_files.end(); + const auto& added_files = *levels_[level].added_files; + v->files_[level].reserve(base_files.size() + added_files.size()); + + for (const auto& added : added_files) { // Add all smaller files listed in base_ - for (std::vector::const_iterator bpos - = std::upper_bound(base_iter, base_end, *added_iter, cmp); + for (auto bpos = std::upper_bound(base_iter, base_end, added, cmp); base_iter != bpos; ++base_iter) { MaybeAddFile(v, level, *base_iter); } - MaybeAddFile(v, level, *added_iter); + MaybeAddFile(v, level, added); } // Add remaining base files @@ -1120,7 +1117,7 @@ class VersionSet::Builder { if (levels_[level].deleted_files.count(f->number) > 0) { // File is deleted: do nothing } else { - std::vector* files = &v->files_[level]; + auto* files = &v->files_[level]; if (level > 0 && !files->empty()) { // Must not overlap assert(vset_->icmp_.Compare((*files)[files->size()-1]->largest, @@ -1210,7 +1207,9 @@ void VersionSet::AppendVersion(Version* v) { v->next_->prev_ = v; } -Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, +Status VersionSet::LogAndApply( + VersionEdit* edit, + port::Mutex* mu, bool new_descriptor_log) { mu->AssertHeld(); @@ -1232,17 +1231,16 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, ManifestWriter* last_writer = &w; assert(!manifest_writers_.empty()); assert(manifest_writers_.front() == &w); - std::deque::iterator iter = manifest_writers_.begin(); - for (; iter != manifest_writers_.end(); ++iter) { - last_writer = *iter; - LogAndApplyHelper(&builder, v, last_writer->edit, mu); - batch_edits.push_back(last_writer->edit); + for (const auto& writer : manifest_writers_) { + last_writer = writer; + LogAndApplyHelper(&builder, v, writer->edit, mu); + batch_edits.push_back(writer->edit); } builder.SaveTo(v); // Initialize new descriptor log file if necessary by creating // a temporary file that contains a snapshot of the current version. 
- std::string new_manifest_file; + std::string new_manifest_filename; uint64_t new_manifest_file_size = 0; Status s; // we will need this if we are creating new manifest @@ -1256,7 +1254,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, } if (new_descriptor_log) { - new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_); + new_manifest_filename = DescriptorFileName(dbname_, manifest_file_number_); edit->SetNextFile(next_file_number_); } @@ -1271,9 +1269,10 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, // This is fine because everything inside of this block is serialized -- // only one thread can be here at the same time - if (!new_manifest_file.empty()) { + if (!new_manifest_filename.empty()) { unique_ptr descriptor_file; - s = env_->NewWritableFile(new_manifest_file, &descriptor_file, + s = env_->NewWritableFile(new_manifest_filename, + &descriptor_file, storage_options_); if (s.ok()) { descriptor_log_.reset(new log::Writer(std::move(descriptor_file))); @@ -1321,7 +1320,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, // If we just created a new descriptor file, install it by writing a // new CURRENT file that points to it. - if (s.ok() && !new_manifest_file.empty()) { + if (s.ok() && !new_manifest_filename.empty()) { s = SetCurrentFile(env_, dbname_, manifest_file_number_); if (s.ok() && old_manifest_file_number < manifest_file_number_) { // delete old manifest file @@ -1356,9 +1355,9 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, Log(options_->info_log, "Error in committing version %lu", (unsigned long)v->GetVersionNumber()); delete v; - if (!new_manifest_file.empty()) { + if (!new_manifest_filename.empty()) { descriptor_log_.reset(); - env_->DeleteFile(new_manifest_file); + env_->DeleteFile(new_manifest_filename); } } @@ -1410,27 +1409,33 @@ Status VersionSet::Recover() { }; // Read "CURRENT" file, which contains a pointer to the current manifest file - std::string current; - Status s = ReadFileToString(env_, CurrentFileName(dbname_), ¤t); + std::string manifest_filename; + Status s = ReadFileToString( + env_, CurrentFileName(dbname_), &manifest_filename + ); if (!s.ok()) { return s; } - if (current.empty() || current[current.size()-1] != '\n') { + if (manifest_filename.empty() || + manifest_filename.back() != '\n') { return Status::Corruption("CURRENT file does not end with newline"); } - current.resize(current.size() - 1); + // remove the trailing '\n' + manifest_filename.resize(manifest_filename.size() - 1); Log(options_->info_log, "Recovering from manifest file:%s\n", - current.c_str()); + manifest_filename.c_str()); - std::string dscname = dbname_ + "/" + current; - unique_ptr file; - s = env_->NewSequentialFile(dscname, &file, storage_options_); + manifest_filename = dbname_ + "/" + manifest_filename; + unique_ptr manifest_file; + s = env_->NewSequentialFile( + manifest_filename, &manifest_file, storage_options_ + ); if (!s.ok()) { return s; } uint64_t manifest_file_size; - s = env_->GetFileSize(dscname, &manifest_file_size); + s = env_->GetFileSize(manifest_filename, &manifest_file_size); if (!s.ok()) { return s; } @@ -1448,8 +1453,8 @@ Status VersionSet::Recover() { { LogReporter reporter; reporter.status = &s; - log::Reader reader(std::move(file), &reporter, true/*checksum*/, - 0/*initial_offset*/); + log::Reader reader(std::move(manifest_file), &reporter, true /*checksum*/, + 0 /*initial_offset*/); Slice record; std::string scratch; while (reader.ReadRecord(&record, 
&scratch) && s.ok()) { @@ -1489,7 +1494,6 @@ Status VersionSet::Recover() { } } } - file.reset(); if (s.ok()) { if (!have_next_file) { @@ -1529,7 +1533,7 @@ Status VersionSet::Recover() { "manifest_file_number is %lu, next_file_number is %lu, " "last_sequence is %lu, log_number is %lu," "prev_log_number is %lu\n", - current.c_str(), + manifest_filename.c_str(), (unsigned long)manifest_file_number_, (unsigned long)next_file_number_, (unsigned long)last_sequence_, @@ -1844,9 +1848,9 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { // Save files for (int level = 0; level < NumberLevels(); level++) { - const std::vector& files = current_->files_[level]; + const auto& files = current_->files_[level]; for (size_t i = 0; i < files.size(); i++) { - const FileMetaData* f = files[i]; + const auto f = files[i]; edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest, f->smallest_seqno, f->largest_seqno); } diff --git a/db/version_set.h b/db/version_set.h index 579ec3346..3f8f95585 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -78,8 +78,8 @@ class Version { }; void Get(const ReadOptions&, const LookupKey& key, std::string* val, Status* status, MergeContext* merge_context, - GetStats* stats, const Options& db_option, bool* value_found = - nullptr); + GetStats* stats, const Options& db_option, + bool* value_found = nullptr); // Adds "stats" into the current state. Returns true if a new // compaction may need to be triggered, false otherwise. @@ -172,7 +172,7 @@ class Version { // but files in each level are now sorted based on file // size. The file with the largest size is at the front. // This vector stores the index of the file from files_. - std::vector< std::vector > files_by_size_; + std::vector> files_by_size_; // An index into files_by_size_ that specifies the first // file that is not yet compacted From 774ed89c2405ee058086b099cbc8b29e243739cc Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Thu, 2 Jan 2014 11:26:57 -0800 Subject: [PATCH 43/70] Replace vector with autovector Summary: this diff only replace the cases when we need to frequently create vector with small amount of entries. This diff doesn't aim to improve performance of a specific area, but more like a small scale test for the autovector and see how it works in real life. Test Plan: make check I also ran the performance tests, however there is no performance gain/loss. All performance numbers are pretty much the same before/after the change. 
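(Editor's note, added between the commit message and the diff: the patch below
shows autovector's members and call sites but never states the core idea in one
place. The following stand-alone sketch condenses it under stated assumptions:
the member names mirror util/autovector.h, while everything else, including the
simplified copy-based push_back and the fixed default kSize, is illustrative
rather than the actual implementation.)

    #include <cstddef>
    #include <vector>

    // The first kSize elements live in a fixed in-object array; only when
    // that array is full does the container fall back to a heap-backed
    // std::vector.
    template <class T, size_t kSize = 8>
    class small_vector {
     public:
      void push_back(const T& item) {
        if (num_stack_items_ < kSize) {
          values_[num_stack_items_++] = item;  // no allocation on this path
        } else {
          vect_.push_back(item);               // overflow: ordinary growth
        }
      }
      size_t size() const { return num_stack_items_ + vect_.size(); }
      T& operator[](size_t n) {
        return n < num_stack_items_ ? values_[n]
                                    : vect_[n - num_stack_items_];
      }

     private:
      size_t num_stack_items_ = 0;  // current number of items (as in the patch)
      T values_[kSize];             // the first `kSize` items (as in the patch)
      std::vector<T> vect_;         // used only past `kSize` items
    };

For the workloads the summary describes, such as the handful of memtables
collected per flush, size() rarely exceeds kSize, so the vector member stays
empty and no heap allocation happens at all.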
Reviewers: dhruba, haobo, sdong, igor CC: leveldb Differential Revision: https://reviews.facebook.net/D14985 --- db/db_impl.cc | 56 ++++++++++++++++++++---------------------- db/db_impl.h | 20 +++++++-------- db/memtable_list.cc | 10 ++++---- db/memtable_list.h | 18 ++++++++------ util/autovector.h | 60 +++++++++++++-------------------------------- util/cache.cc | 5 ++-- 6 files changed, 70 insertions(+), 99 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 093112857..2b70ff60f 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -47,6 +47,7 @@ #include "table/block_based_table_factory.h" #include "table/merger.h" #include "table/two_level_iterator.h" +#include "util/autovector.h" #include "util/auto_roll_logger.h" #include "util/build_version.h" #include "util/coding.h" @@ -299,8 +300,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) } DBImpl::~DBImpl() { - std::vector to_delete; - to_delete.reserve(options_.max_write_buffer_number); + autovector to_delete; // Wait for background work to finish if (flush_on_destroy_ && mem_->GetFirstSequenceNumber() != 0) { @@ -455,10 +455,6 @@ void DBImpl::MaybeDumpStats() { } // DBImpl::SuperVersion methods -DBImpl::SuperVersion::SuperVersion(const int num_memtables) { - to_delete.resize(num_memtables); -} - DBImpl::SuperVersion::~SuperVersion() { for (auto td : to_delete) { delete td; @@ -1114,7 +1110,7 @@ Status DBImpl::WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit) { } -Status DBImpl::WriteLevel0Table(std::vector &mems, VersionEdit* edit, +Status DBImpl::WriteLevel0Table(autovector& mems, VersionEdit* edit, uint64_t* filenumber) { mutex_.AssertHeld(); const uint64_t start_micros = env_->NowMicros(); @@ -1131,15 +1127,15 @@ Status DBImpl::WriteLevel0Table(std::vector &mems, VersionEdit* edit, Status s; { mutex_.Unlock(); - std::vector list; + std::vector memtables; for (MemTable* m : mems) { Log(options_.info_log, "Flushing memtable with log file: %lu\n", (unsigned long)m->GetLogNumber()); - list.push_back(m->NewIterator()); + memtables.push_back(m->NewIterator()); } - Iterator* iter = NewMergingIterator(env_, &internal_comparator_, &list[0], - list.size()); + Iterator* iter = NewMergingIterator( + env_, &internal_comparator_, &memtables[0], memtables.size()); Log(options_.info_log, "Level-0 flush table #%lu: started", (unsigned long)meta.number); @@ -1214,7 +1210,7 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress, // Save the contents of the earliest memtable as a new Table uint64_t file_number; - std::vector mems; + autovector mems; imm_.PickMemtablesToFlush(&mems); if (mems.empty()) { Log(options_.info_log, "Nothing in memstore to flush"); @@ -1316,8 +1312,7 @@ void DBImpl::ReFitLevel(int level, int target_level) { assert(level < NumberLevels()); SuperVersion* superversion_to_free = nullptr; - SuperVersion* new_superversion = - new SuperVersion(options_.max_write_buffer_number); + SuperVersion* new_superversion = new SuperVersion(); mutex_.Lock(); @@ -1750,7 +1745,7 @@ Status DBImpl::BackgroundFlush(bool* madeProgress, void DBImpl::BackgroundCallFlush() { bool madeProgress = false; - DeletionState deletion_state(options_.max_write_buffer_number, true); + DeletionState deletion_state(true); assert(bg_flush_scheduled_); MutexLock l(&mutex_); @@ -1796,7 +1791,7 @@ void DBImpl::TEST_PurgeObsoleteteWAL() { void DBImpl::BackgroundCallCompaction() { bool madeProgress = false; - DeletionState deletion_state(options_.max_write_buffer_number, true); + DeletionState deletion_state(true); 
MaybeDumpStats(); @@ -2591,16 +2586,16 @@ namespace { struct IterState { port::Mutex* mu; Version* version; - std::vector mem; // includes both mem_ and imm_ + autovector mem; // includes both mem_ and imm_ DBImpl *db; }; static void CleanupIteratorState(void* arg1, void* arg2) { IterState* state = reinterpret_cast(arg1); - DBImpl::DeletionState deletion_state(state->db->GetOptions(). - max_write_buffer_number); + DBImpl::DeletionState deletion_state; state->mu->Lock(); - for (unsigned int i = 0; i < state->mem.size(); i++) { + auto mems_size = state->mem.size(); + for (size_t i = 0; i < mems_size; i++) { MemTable* m = state->mem[i]->Unref(); if (m != nullptr) { deletion_state.memtables_to_free.push_back(m); @@ -2620,7 +2615,7 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, SequenceNumber* latest_snapshot) { IterState* cleanup = new IterState; MemTable* mutable_mem; - std::vector immutables; + autovector immutables; Version* version; // Collect together all needed child iterators for mem @@ -2638,16 +2633,17 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, version = versions_->current(); mutex_.Unlock(); - std::vector list; - list.push_back(mutable_mem->NewIterator(options)); + std::vector memtables; + memtables.push_back(mutable_mem->NewIterator(options)); cleanup->mem.push_back(mutable_mem); for (MemTable* m : immutables) { - list.push_back(m->NewIterator(options)); + memtables.push_back(m->NewIterator(options)); cleanup->mem.push_back(m); } - version->AddIterators(options, storage_options_, &list); - Iterator* internal_iter = - NewMergingIterator(env_, &internal_comparator_, &list[0], list.size()); + version->AddIterators(options, storage_options_, &memtables); + Iterator* internal_iter = NewMergingIterator( + env_, &internal_comparator_, memtables.data(), memtables.size() + ); cleanup->version = version; cleanup->mu = &mutex_; cleanup->db = this; @@ -2802,7 +2798,7 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, StartPerfTimer(&snapshot_timer); SequenceNumber snapshot; - std::vector to_delete; + autovector to_delete; mutex_.Lock(); if (options.snapshot != nullptr) { @@ -3322,7 +3318,7 @@ Status DBImpl::MakeRoomForWrite(bool force, lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size); new_mem = new MemTable( internal_comparator_, mem_rep_factory_, NumberLevels(), options_); - new_superversion = new SuperVersion(options_.max_write_buffer_number); + new_superversion = new SuperVersion(); } } mutex_.Lock(); @@ -3703,7 +3699,7 @@ Status DBImpl::DeleteFile(std::string name) { FileMetaData metadata; int maxlevel = NumberLevels(); VersionEdit edit(maxlevel); - DeletionState deletion_state(0, true); + DeletionState deletion_state(true); { MutexLock l(&mutex_); status = versions_->GetMetadataForFile(number, &level, &metadata); diff --git a/db/db_impl.h b/db/db_impl.h index 3b02ee9b1..b8056edd5 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -7,6 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#pragma once + #include #include #include @@ -16,13 +17,14 @@ #include "db/log_writer.h" #include "db/snapshot.h" #include "db/version_edit.h" +#include "memtable_list.h" +#include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "rocksdb/transaction_log.h" -#include "port/port.h" +#include "util/autovector.h" #include "util/stats_logger.h" -#include "memtable_list.h" namespace rocksdb { @@ -138,10 +140,10 @@ class DBImpl : public DB { // We need to_delete because during Cleanup(), imm.UnrefAll() returns // all memtables that we need to free through this vector. We then // delete all those memtables outside of mutex, during destruction - std::vector to_delete; + autovector to_delete; // should be called outside the mutex - explicit SuperVersion(const int num_memtables = 0); + SuperVersion() = default; ~SuperVersion(); SuperVersion* Ref(); // Returns true if this was the last reference and caller should @@ -180,7 +182,7 @@ class DBImpl : public DB { std::vector log_delete_files; // a list of memtables to be free - std::vector memtables_to_free; + autovector memtables_to_free; SuperVersion* superversion_to_free; // if nullptr nothing to free @@ -190,15 +192,13 @@ class DBImpl : public DB { // that corresponds to the set of files in 'live'. uint64_t manifest_file_number, log_number, prev_log_number; - explicit DeletionState(const int num_memtables = 0, - bool create_superversion = false) { + explicit DeletionState(bool create_superversion = false) { manifest_file_number = 0; log_number = 0; prev_log_number = 0; - memtables_to_free.reserve(num_memtables); superversion_to_free = nullptr; new_superversion = - create_superversion ? new SuperVersion(num_memtables) : nullptr; + create_superversion ? new SuperVersion() : nullptr; } ~DeletionState() { @@ -283,7 +283,7 @@ class DBImpl : public DB { // for the entire period. The second method WriteLevel0Table supports // concurrent flush memtables to storage. Status WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit); - Status WriteLevel0Table(std::vector &mems, VersionEdit* edit, + Status WriteLevel0Table(autovector& mems, VersionEdit* edit, uint64_t* filenumber); uint64_t SlowdownAmount(int n, int top, int bottom); diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 27e12b945..7197a92ea 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -31,7 +31,7 @@ void MemTableList::RefAll() { // Drop reference count on all underling memtables. If the // refcount of an underlying memtable drops to zero, then // return it in to_delete vector. -void MemTableList::UnrefAll(std::vector* to_delete) { +void MemTableList::UnrefAll(autovector* to_delete) { for (auto &memtable : memlist_) { MemTable* m = memtable->Unref(); if (m != nullptr) { @@ -58,7 +58,7 @@ bool MemTableList::IsFlushPending(int min_write_buffer_number_to_merge) { } // Returns the memtables that need to be flushed. 
-void MemTableList::PickMemtablesToFlush(std::vector<MemTable*>* ret) {
+void MemTableList::PickMemtablesToFlush(autovector<MemTable*>* ret) {
   for (auto it = memlist_.rbegin(); it != memlist_.rend(); it++) {
     MemTable* m = *it;
     if (!m->flush_in_progress_) {
@@ -76,12 +76,12 @@ void MemTableList::PickMemtablesToFlush(std::vector<MemTable*>* ret) {

 // Record a successful flush in the manifest file
 Status MemTableList::InstallMemtableFlushResults(
-    const std::vector<MemTable*> &mems,
+    const autovector<MemTable*> &mems,
     VersionSet* vset, Status flushStatus,
     port::Mutex* mu, Logger* info_log,
     uint64_t file_number,
     std::set<uint64_t>& pending_outputs,
-    std::vector<MemTable*>* to_delete) {
+    autovector<MemTable*>* to_delete) {
   mu->AssertHeld();

   // If the flush was not successful, then just reset state.
@@ -213,7 +213,7 @@ bool MemTableList::Get(const LookupKey& key, std::string* value, Status* s,
   return false;
 }

-void MemTableList::GetMemTables(std::vector<MemTable*>* output) {
+void MemTableList::GetMemTables(autovector<MemTable*>* output) {
   for (auto &memtable : memlist_) {
     output->push_back(memtable);
   }
diff --git a/db/memtable_list.h b/db/memtable_list.h
index ed353c8b8..9831d7621 100644
--- a/db/memtable_list.h
+++ b/db/memtable_list.h
@@ -3,15 +3,17 @@
 // LICENSE file in the root directory of this source tree. An additional grant
 // of patent rights can be found in the PATENTS file in the same directory.
 //
-
 #pragma once
+
 #include <string>
 #include <list>
 #include <vector>
-#include "rocksdb/db.h"
+
 #include "db/dbformat.h"
+#include "db/memtable.h"
 #include "db/skiplist.h"
-#include "memtable.h"
+#include "rocksdb/db.h"
+#include "util/autovector.h"

 namespace rocksdb {

@@ -47,7 +49,7 @@ class MemTableList {
   // Drop reference count on all underling memtables. If the refcount
   // on an underlying memtable drops to zero, then return it in
   // to_delete vector.
-  void UnrefAll(std::vector<MemTable*>* to_delete);
+  void UnrefAll(autovector<MemTable*>* to_delete);

   // Returns the total number of memtables in the list
   int size();

@@ -58,15 +60,15 @@ class MemTableList {
   // Returns the earliest memtables that needs to be flushed. The returned
   // memtables are guaranteed to be in the ascending order of created time.
-  void PickMemtablesToFlush(std::vector<MemTable*>* mems);
+  void PickMemtablesToFlush(autovector<MemTable*>* mems);

   // Commit a successful flush in the manifest file
-  Status InstallMemtableFlushResults(const std::vector<MemTable*> &m,
+  Status InstallMemtableFlushResults(const autovector<MemTable*> &m,
                                      VersionSet* vset, Status flushStatus,
                                      port::Mutex* mu, Logger* info_log,
                                      uint64_t file_number,
                                      std::set<uint64_t>& pending_outputs,
-                                     std::vector<MemTable*>* to_delete);
+                                     autovector<MemTable*>* to_delete);

   // New memtables are inserted at the front of the list.
   // Takes ownership of the referenced held on *m by the caller of Add().
@@ -81,7 +83,7 @@ class MemTableList {
               MergeContext& merge_context, const Options& options);

   // Returns the list of underlying memtables.
- void GetMemTables(std::vector* list); + void GetMemTables(autovector* list); // Request a flush of all existing memtables to storage void FlushRequested() { flush_requested_ = true; } diff --git a/util/autovector.h b/util/autovector.h index 9998e2956..812a61795 100644 --- a/util/autovector.h +++ b/util/autovector.h @@ -57,11 +57,9 @@ class autovector { typedef std::random_access_iterator_tag iterator_category; iterator_impl(TAutoVector* vect, size_t index) - : vect_(vect) - , index_(index) { - }; + : vect_(vect), index_(index) {}; iterator_impl(const iterator_impl&) = default; - ~iterator_impl() { } + ~iterator_impl() {} iterator_impl& operator=(const iterator_impl&) = default; // -- Advancement @@ -130,9 +128,7 @@ class autovector { return index_ == other.index_; } - bool operator!=(const self_type& other) const { - return !(*this == other); - } + bool operator!=(const self_type& other) const { return !(*this == other); } bool operator>(const self_type& other) const { assert(vect_ == other.vect_); @@ -174,13 +170,9 @@ class autovector { return vect_.capacity() == 0; } - size_type size() const { - return num_stack_items_ + vect_.size(); - } + size_type size() const { return num_stack_items_ + vect_.size(); } - bool empty() const { - return size() == 0; - } + bool empty() const { return size() == 0; } // will not check boundry const_reference operator[](size_type n) const { @@ -235,11 +227,9 @@ class autovector { } } - void push_back(const T& item) { - push_back(value_type(item)); - } + void push_back(const T& item) { push_back(value_type(item)); } - template + template void emplace_back(Args&&... args) { push_back(value_type(args...)); } @@ -261,13 +251,9 @@ class autovector { // -- Copy and Assignment autovector& assign(const autovector& other); - autovector(const autovector& other) { - assign(other); - } + autovector(const autovector& other) { assign(other); } - autovector& operator=(const autovector& other) { - return assign(other); - } + autovector& operator=(const autovector& other) { return assign(other); } // move operation are disallowed since it is very hard to make sure both // autovectors are allocated from the same function stack. @@ -275,41 +261,29 @@ class autovector { autovector(autovector&& other) = delete; // -- Iterator Operations - iterator begin() { - return iterator(this, 0); - } + iterator begin() { return iterator(this, 0); } - const_iterator begin() const { - return const_iterator(this, 0); - } + const_iterator begin() const { return const_iterator(this, 0); } - iterator end() { - return iterator(this, this->size()); - } + iterator end() { return iterator(this, this->size()); } - const_iterator end() const { - return const_iterator(this, this->size()); - } + const_iterator end() const { return const_iterator(this, this->size()); } - reverse_iterator rbegin() { - return reverse_iterator(end()); - } + reverse_iterator rbegin() { return reverse_iterator(end()); } const_reverse_iterator rbegin() const { return const_reverse_iterator(end()); } - reverse_iterator rend() { - return reverse_iterator(begin()); - } + reverse_iterator rend() { return reverse_iterator(begin()); } const_reverse_iterator rend() const { return const_reverse_iterator(begin()); } private: - size_type num_stack_items_ = 0; // current number of items - value_type values_[kSize]; // the first `kSize` items + size_type num_stack_items_ = 0; // current number of items + value_type values_[kSize]; // the first `kSize` items // used only if there are more than `kSize` items. 
std::vector vect_; }; diff --git a/util/cache.cc b/util/cache.cc index ddd808b41..143c6957a 100644 --- a/util/cache.cc +++ b/util/cache.cc @@ -10,10 +10,10 @@ #include #include #include -#include #include "rocksdb/cache.h" #include "port/port.h" +#include "util/autovector.h" #include "util/hash.h" #include "util/mutexlock.h" @@ -264,8 +264,7 @@ Cache::Handle* LRUCache::Insert( LRUHandle* e = reinterpret_cast( malloc(sizeof(LRUHandle)-1 + key.size())); - std::vector last_reference_list; - last_reference_list.reserve(1); + autovector last_reference_list; e->value = value; e->deleter = deleter; From 5e7d5629c75eaca4a020a4c3dc7e1c343a0502c8 Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Fri, 3 Jan 2014 10:53:21 -0800 Subject: [PATCH 44/70] Fix the valgrind issues --- db/db_test.cc | 20 ++++++++++++++------ db/plain_table_db_test.cc | 16 ++++++++++------ table/table_test.cc | 6 +++++- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 1e39454df..5b21b2f1b 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -240,6 +240,8 @@ class SpecialEnv : public EnvWrapper { class DBTest { private: const FilterPolicy* filter_policy_; + static std::unique_ptr prefix_1_transform; + static std::unique_ptr noop_transform; protected: // Sequence of option configurations to try @@ -356,13 +358,13 @@ class DBTest { break; case kPlainTableFirstBytePrefix: options.table_factory.reset(new PlainTableFactory()); - options.prefix_extractor = NewFixedPrefixTransform(1); + options.prefix_extractor = prefix_1_transform.get(); options.allow_mmap_reads = true; options.max_sequential_skip_in_iterations = 999999; break; case kPlainTableAllBytesPrefix: options.table_factory.reset(new PlainTableFactory()); - options.prefix_extractor = NewNoopTransform(); + options.prefix_extractor = noop_transform.get(); options.allow_mmap_reads = true; options.max_sequential_skip_in_iterations = 999999; break; @@ -694,6 +696,10 @@ class DBTest { delete iter; } }; +std::unique_ptr DBTest::prefix_1_transform( + NewFixedPrefixTransform(1)); +std::unique_ptr DBTest::noop_transform( + NewNoopTransform()); static std::string Key(int i) { char buf[100]; @@ -4694,20 +4700,22 @@ TEST(DBTest, PrefixScan) { snprintf(buf, sizeof(buf), "03______:"); prefix = Slice(buf, 8); key = Slice(buf, 9); - auto prefix_extractor = NewFixedPrefixTransform(8); // db configs env_->count_random_reads_ = true; Options options = CurrentOptions(); options.env = env_; options.no_block_cache = true; - options.filter_policy = NewBloomFilterPolicy(10); - options.prefix_extractor = prefix_extractor; + options.filter_policy = NewBloomFilterPolicy(10); + options.prefix_extractor = NewFixedPrefixTransform(8); options.whole_key_filtering = false; options.disable_auto_compactions = true; options.max_background_compactions = 2; options.create_if_missing = true; options.disable_seek_compaction = true; - options.memtable_factory.reset(NewHashSkipListRepFactory(prefix_extractor)); + // Tricky: options.prefix_extractor will be released by + // NewHashSkipListRepFactory after use. 
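+  // (Editor's addition for clarity: the factory takes ownership of the raw
+  //  pointer it is handed, which is why this test passes the freshly
+  //  allocated options.prefix_extractor here rather than one of the shared
+  //  static transforms introduced above for the other option configs.)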
+ options.memtable_factory.reset( + NewHashSkipListRepFactory(options.prefix_extractor)); // prefix specified, with blooms: 2 RAND I/Os // SeekToFirst diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 17f871e4c..1ead9729a 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -35,16 +35,17 @@ using std::unique_ptr; namespace rocksdb { class PlainTableDBTest { -protected: -public: + protected: + private: std::string dbname_; Env* env_; DB* db_; Options last_options_; + static std::unique_ptr prefix_transform; - PlainTableDBTest() : - env_(Env::Default()) { + public: + PlainTableDBTest() : env_(Env::Default()) { dbname_ = test::TmpDir() + "/plain_table_db_test"; ASSERT_OK(DestroyDB(dbname_, Options())); db_ = nullptr; @@ -60,7 +61,7 @@ public: Options CurrentOptions() { Options options; options.table_factory.reset(new PlainTableFactory(16, 2, 0.8)); - options.prefix_extractor = NewFixedPrefixTransform(8); + options.prefix_extractor = prefix_transform.get(); options.allow_mmap_reads = true; return options; } @@ -167,8 +168,11 @@ public: } }; +std::unique_ptr PlainTableDBTest::prefix_transform( + NewFixedPrefixTransform(8)); + TEST(PlainTableDBTest, Empty) { - ASSERT_TRUE(db_ != nullptr); + ASSERT_TRUE(dbfull() != nullptr); ASSERT_EQ("NOT_FOUND", Get("0000000000000foo")); } diff --git a/table/table_test.cc b/table/table_test.cc index bff8ee529..a36e726a1 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -647,7 +647,7 @@ class Harness { case PLAIN_TABLE_FULL_STR_PREFIX: support_prev_ = false; only_support_prefix_seek_ = true; - options_.prefix_extractor = NewNoopTransform(); + options_.prefix_extractor = noop_transform.get(); options_.allow_mmap_reads = true; options_.table_factory.reset(new PlainTableFactory()); constructor_ = new TableConstructor(options_.comparator, true); @@ -849,8 +849,12 @@ class Harness { bool support_prev_; bool only_support_prefix_seek_; shared_ptr internal_comparator_; + static std::unique_ptr noop_transform; }; +std::unique_ptr Harness::noop_transform( + NewNoopTransform()); + static bool Between(uint64_t val, uint64_t low, uint64_t high) { bool result = (val >= low) && (val <= high); if (!result) { From 8c4eb71b5d173a22523aec741d33369684414ce8 Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Fri, 3 Jan 2014 18:27:33 -0800 Subject: [PATCH 45/70] Fix one more valgrind error in table_test --- table/table_test.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/table/table_test.cc b/table/table_test.cc index a36e726a1..c6f7d2275 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -638,7 +638,7 @@ class Harness { case PLAIN_TABLE_SEMI_FIXED_PREFIX: support_prev_ = false; only_support_prefix_seek_ = true; - options_.prefix_extractor = new FixedOrLessPrefixTransform(2); + options_.prefix_extractor = prefix_transform.get(); options_.allow_mmap_reads = true; options_.table_factory.reset(new PlainTableFactory()); constructor_ = new TableConstructor(options_.comparator, true); @@ -850,10 +850,13 @@ class Harness { bool only_support_prefix_seek_; shared_ptr internal_comparator_; static std::unique_ptr noop_transform; + static std::unique_ptr prefix_transform; }; std::unique_ptr Harness::noop_transform( NewNoopTransform()); +std::unique_ptr Harness::prefix_transform( + new FixedOrLessPrefixTransform(2)); static bool Between(uint64_t val, uint64_t low, uint64_t high) { bool result = (val >= low) && (val <= high); From 424a524ac91fb58269b7d672717c5fc830cd2f11 Mon Sep 17 00:00:00 2001 From: Siying Dong 
Date: Fri, 27 Dec 2013 12:56:27 -0800 Subject: [PATCH 46/70] [Performance Branch] A Hashed Linked List Based Mem Table Summary: Implement a mem table in which keys are hashed based on prefixes. In each bucket, entries are organized in a sorted linked list. It has the same thread safety guarantee as the skip list. The motivation is to optimize memory usage for the case where prefix hashing is the primary way of seeking to the entry. Compared to the hash skip list implementation, this implementation is more memory efficient, but inside each bucket, search is always linear. The target scenario is that there is only a very limited number of records in each hash bucket. Test Plan: Add a test case in db_test Reviewers: haobo, kailiu, dhruba Reviewed By: haobo CC: igor, nkg-, leveldb Differential Revision: https://reviews.facebook.net/D14979 --- db/db_test.cc | 7 + db/prefix_test.cc | 376 ++++++++++++++------------- include/rocksdb/memtablerep.h | 7 + util/hash_linklist_rep.cc | 462 ++++++++++++++++++++++++++++++++++ util/hash_linklist_rep.h | 39 +++ 5 files changed, 716 insertions(+), 175 deletions(-) create mode 100644 util/hash_linklist_rep.cc create mode 100644 util/hash_linklist_rep.h diff --git a/db/db_test.cc b/db/db_test.cc index 5b21b2f1b..8e2bc9f27 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -29,6 +29,7 @@ #include "util/mutexlock.h" #include "util/testharness.h" #include "util/testutil.h" +#include "util/hash_linklist_rep.h" #include "utilities/merge_operators.h" namespace rocksdb { @@ -250,6 +251,7 @@ class DBTest { kPlainTableFirstBytePrefix, kPlainTableAllBytesPrefix, kVectorRep, + kHashLinkList, kMergePut, kFilter, kUncompressed, @@ -403,6 +405,10 @@ class DBTest { case kVectorRep: options.memtable_factory.reset(new VectorRepFactory(100)); break; + case kHashLinkList: + options.memtable_factory.reset( + NewHashLinkListRepFactory(NewFixedPrefixTransform(1), 4)); + break; case kUniversalCompaction: options.compaction_style = kCompactionStyleUniversal; break; @@ -4521,6 +4527,7 @@ TEST(DBTest, Randomized) { int p = rnd.Uniform(100); int minimum = 0; if (option_config_ == kHashSkipList || + option_config_ == kHashLinkList || option_config_ == kPlainTableFirstBytePrefix) { minimum = 1; } diff --git a/db/prefix_test.cc b/db/prefix_test.cc index f66091d11..66cef92cb 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -109,20 +109,6 @@ class PrefixTest { FLAGS_min_write_buffer_number_to_merge; options.comparator = new TestKeyComparator(); - if (FLAGS_use_prefix_hash_memtable) { - auto prefix_extractor = NewFixedPrefixTransform(8); - options.prefix_extractor = prefix_extractor; - if (FLAGS_use_nolock_version) { - options.memtable_factory.reset(NewHashSkipListRepFactory( - prefix_extractor, FLAGS_bucket_count, - FLAGS_skiplist_height)); - } else { - options.memtable_factory = - std::make_shared( - prefix_extractor, FLAGS_bucket_count, FLAGS_num_locks); - } - } - options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits; options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes; @@ -130,216 +116,256 @@ class PrefixTest { ASSERT_OK(s); return std::shared_ptr(db); } + + bool NextOptions() { + // skip some options + option_config_++; + if (option_config_ < kEnd) { + auto prefix_extractor = NewFixedPrefixTransform(8); + options.prefix_extractor = prefix_extractor; + switch(option_config_) { + case kHashSkipList: + options.memtable_factory.reset( + NewHashSkipListRepFactory(options.prefix_extractor, + FLAGS_bucket_count, + FLAGS_skiplist_height)); + return true; + case 
kHashLinkList: + options.memtable_factory.reset( + NewHashLinkListRepFactory(options.prefix_extractor, + FLAGS_bucket_count)); + return true; + default: + return false; + } + } + return false; + } + + PrefixTest() : option_config_(kBegin) { } ~PrefixTest() { delete options.comparator; } protected: + enum OptionConfig { + kBegin, + kHashSkipList, + kHashLinkList, + kEnd + }; + int option_config_; Options options; }; TEST(PrefixTest, DynamicPrefixIterator) { + while (NextOptions()) { + std::cout << "*** Mem table: " << options.memtable_factory->Name() + << std::endl; + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + std::vector prefixes; + for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) { + prefixes.push_back(i); + } - DestroyDB(kDbName, Options()); - auto db = OpenDb(); - WriteOptions write_options; - ReadOptions read_options; + if (FLAGS_random_prefix) { + std::random_shuffle(prefixes.begin(), prefixes.end()); + } - std::vector prefixes; - for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) { - prefixes.push_back(i); - } + HistogramImpl hist_put_time; + HistogramImpl hist_put_comparison; - if (FLAGS_random_prefix) { - std::random_shuffle(prefixes.begin(), prefixes.end()); - } + // insert x random prefix, each with y continuous element. + for (auto prefix : prefixes) { + for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { + TestKey test_key(prefix, sorted); + + Slice key = TestKeyToSlice(test_key); + std::string value(FLAGS_value_size, 0); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + ASSERT_OK(db->Put(write_options, key, value)); + hist_put_time.Add(timer.ElapsedNanos()); + hist_put_comparison.Add(perf_context.user_key_comparison_count); + } + } + + std::cout << "Put key comparison: \n" << hist_put_comparison.ToString() + << "Put time: \n" << hist_put_time.ToString(); - HistogramImpl hist_put_time; - HistogramImpl hist_put_comparison; + // test seek existing keys + HistogramImpl hist_seek_time; + HistogramImpl hist_seek_comparison; - // insert x random prefix, each with y continuous element. 
- for (auto prefix : prefixes) { - for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { - TestKey test_key(prefix, sorted); + if (FLAGS_use_prefix_hash_memtable) { + read_options.prefix_seek = true; + } + std::unique_ptr iter(db->NewIterator(read_options)); + for (auto prefix : prefixes) { + TestKey test_key(prefix, FLAGS_items_per_prefix / 2); Slice key = TestKeyToSlice(test_key); - std::string value(FLAGS_value_size, 0); + std::string value = "v" + std::to_string(0); perf_context.Reset(); StopWatchNano timer(Env::Default(), true); - ASSERT_OK(db->Put(write_options, key, value)); - hist_put_time.Add(timer.ElapsedNanos()); - hist_put_comparison.Add(perf_context.user_key_comparison_count); + uint64_t total_keys = 0; + for (iter->Seek(key); iter->Valid(); iter->Next()) { + if (FLAGS_trigger_deadlock) { + std::cout << "Behold the deadlock!\n"; + db->Delete(write_options, iter->key()); + } + auto test_key = SliceToTestKey(iter->key()); + if (test_key->prefix != prefix) break; + total_keys++; + } + hist_seek_time.Add(timer.ElapsedNanos()); + hist_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2); } - } - std::cout << "Put key comparison: \n" << hist_put_comparison.ToString() - << "Put time: \n" << hist_put_time.ToString(); + std::cout << "Seek key comparison: \n" + << hist_seek_comparison.ToString() + << "Seek time: \n" + << hist_seek_time.ToString(); - // test seek existing keys - HistogramImpl hist_seek_time; - HistogramImpl hist_seek_comparison; + // test non-existing keys + HistogramImpl hist_no_seek_time; + HistogramImpl hist_no_seek_comparison; - if (FLAGS_use_prefix_hash_memtable) { - read_options.prefix_seek = true; - } - std::unique_ptr iter(db->NewIterator(read_options)); - - for (auto prefix : prefixes) { - TestKey test_key(prefix, FLAGS_items_per_prefix / 2); - Slice key = TestKeyToSlice(test_key); - std::string value = "v" + std::to_string(0); - - perf_context.Reset(); - StopWatchNano timer(Env::Default(), true); - uint64_t total_keys = 0; - for (iter->Seek(key); iter->Valid(); iter->Next()) { - if (FLAGS_trigger_deadlock) { - std::cout << "Behold the deadlock!\n"; - db->Delete(write_options, iter->key()); - } - auto test_key = SliceToTestKey(iter->key()); - if (test_key->prefix != prefix) break; - total_keys++; + for (auto prefix = FLAGS_total_prefixes; + prefix < FLAGS_total_prefixes + 10000; + prefix++) { + TestKey test_key(prefix, 0); + Slice key = TestKeyToSlice(test_key); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + iter->Seek(key); + hist_no_seek_time.Add(timer.ElapsedNanos()); + hist_no_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_TRUE(!iter->Valid()); } - hist_seek_time.Add(timer.ElapsedNanos()); - hist_seek_comparison.Add(perf_context.user_key_comparison_count); - ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2); - } - std::cout << "Seek key comparison: \n" - << hist_seek_comparison.ToString() - << "Seek time: \n" - << hist_seek_time.ToString(); - - // test non-existing keys - HistogramImpl hist_no_seek_time; - HistogramImpl hist_no_seek_comparison; - - for (auto prefix = FLAGS_total_prefixes; - prefix < FLAGS_total_prefixes + 10000; - prefix++) { - TestKey test_key(prefix, 0); - Slice key = TestKeyToSlice(test_key); - - perf_context.Reset(); - StopWatchNano timer(Env::Default(), true); - iter->Seek(key); - hist_no_seek_time.Add(timer.ElapsedNanos()); - 
hist_no_seek_comparison.Add(perf_context.user_key_comparison_count); - ASSERT_TRUE(!iter->Valid()); + std::cout << "non-existing Seek key comparison: \n" + << hist_no_seek_comparison.ToString() + << "non-existing Seek time: \n" + << hist_no_seek_time.ToString(); } - - std::cout << "non-existing Seek key comparison: \n" - << hist_no_seek_comparison.ToString() - << "non-existing Seek time: \n" - << hist_no_seek_time.ToString(); } TEST(PrefixTest, PrefixHash) { + while (NextOptions()) { + std::cout << "*** Mem table: " << options.memtable_factory->Name() + << std::endl; + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + std::vector prefixes; + for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) { + prefixes.push_back(i); + } - DestroyDB(kDbName, Options()); - auto db = OpenDb(); - WriteOptions write_options; - ReadOptions read_options; - - std::vector prefixes; - for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) { - prefixes.push_back(i); - } - - if (FLAGS_random_prefix) { - std::random_shuffle(prefixes.begin(), prefixes.end()); - } + if (FLAGS_random_prefix) { + std::random_shuffle(prefixes.begin(), prefixes.end()); + } - // insert x random prefix, each with y continuous element. - HistogramImpl hist_put_time; - HistogramImpl hist_put_comparison; + // insert x random prefix, each with y continuous element. + HistogramImpl hist_put_time; + HistogramImpl hist_put_comparison; - for (auto prefix : prefixes) { - for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { - TestKey test_key(prefix, sorted); + for (auto prefix : prefixes) { + for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { + TestKey test_key(prefix, sorted); - Slice key = TestKeyToSlice(test_key); - std::string value = "v" + std::to_string(sorted); + Slice key = TestKeyToSlice(test_key); + std::string value = "v" + std::to_string(sorted); - perf_context.Reset(); - StopWatchNano timer(Env::Default(), true); - ASSERT_OK(db->Put(write_options, key, value)); - hist_put_time.Add(timer.ElapsedNanos()); - hist_put_comparison.Add(perf_context.user_key_comparison_count); + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + ASSERT_OK(db->Put(write_options, key, value)); + hist_put_time.Add(timer.ElapsedNanos()); + hist_put_comparison.Add(perf_context.user_key_comparison_count); + } } - } - std::cout << "Put key comparison: \n" << hist_put_comparison.ToString() - << "Put time: \n" << hist_put_time.ToString(); + std::cout << "Put key comparison: \n" << hist_put_comparison.ToString() + << "Put time: \n" << hist_put_time.ToString(); - // test seek existing keys - HistogramImpl hist_seek_time; - HistogramImpl hist_seek_comparison; + // test seek existing keys + HistogramImpl hist_seek_time; + HistogramImpl hist_seek_comparison; - for (auto prefix : prefixes) { - TestKey test_key(prefix, 0); - Slice key = TestKeyToSlice(test_key); - std::string value = "v" + std::to_string(0); + for (auto prefix : prefixes) { + TestKey test_key(prefix, 0); + Slice key = TestKeyToSlice(test_key); + std::string value = "v" + std::to_string(0); - Slice key_prefix; - if (FLAGS_use_prefix_hash_memtable) { - key_prefix = options.prefix_extractor->Transform(key); - read_options.prefix = &key_prefix; - } - std::unique_ptr iter(db->NewIterator(read_options)); + Slice key_prefix; + if (FLAGS_use_prefix_hash_memtable) { + key_prefix = options.prefix_extractor->Transform(key); + read_options.prefix = &key_prefix; + } + std::unique_ptr 
iter(db->NewIterator(read_options)); - perf_context.Reset(); - StopWatchNano timer(Env::Default(), true); - uint64_t total_keys = 0; - for (iter->Seek(key); iter->Valid(); iter->Next()) { - if (FLAGS_trigger_deadlock) { - std::cout << "Behold the deadlock!\n"; - db->Delete(write_options, iter->key()); + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + uint64_t total_keys = 0; + for (iter->Seek(key); iter->Valid(); iter->Next()) { + if (FLAGS_trigger_deadlock) { + std::cout << "Behold the deadlock!\n"; + db->Delete(write_options, iter->key()); + } + auto test_key = SliceToTestKey(iter->key()); + if (test_key->prefix != prefix) break; + total_keys++; } - auto test_key = SliceToTestKey(iter->key()); - if (test_key->prefix != prefix) break; - total_keys++; + hist_seek_time.Add(timer.ElapsedNanos()); + hist_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_EQ(total_keys, FLAGS_items_per_prefix); } - hist_seek_time.Add(timer.ElapsedNanos()); - hist_seek_comparison.Add(perf_context.user_key_comparison_count); - ASSERT_EQ(total_keys, FLAGS_items_per_prefix); - } - std::cout << "Seek key comparison: \n" - << hist_seek_comparison.ToString() - << "Seek time: \n" - << hist_seek_time.ToString(); + std::cout << "Seek key comparison: \n" + << hist_seek_comparison.ToString() + << "Seek time: \n" + << hist_seek_time.ToString(); - // test non-existing keys - HistogramImpl hist_no_seek_time; - HistogramImpl hist_no_seek_comparison; + // test non-existing keys + HistogramImpl hist_no_seek_time; + HistogramImpl hist_no_seek_comparison; - for (auto prefix = FLAGS_total_prefixes; - prefix < FLAGS_total_prefixes + 100; - prefix++) { - TestKey test_key(prefix, 0); - Slice key = TestKeyToSlice(test_key); + for (auto prefix = FLAGS_total_prefixes; + prefix < FLAGS_total_prefixes + 100; + prefix++) { + TestKey test_key(prefix, 0); + Slice key = TestKeyToSlice(test_key); - if (FLAGS_use_prefix_hash_memtable) { - Slice key_prefix = options.prefix_extractor->Transform(key); - read_options.prefix = &key_prefix; + if (FLAGS_use_prefix_hash_memtable) { + Slice key_prefix = options.prefix_extractor->Transform(key); + read_options.prefix = &key_prefix; + } + std::unique_ptr iter(db->NewIterator(read_options)); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + iter->Seek(key); + hist_no_seek_time.Add(timer.ElapsedNanos()); + hist_no_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_TRUE(!iter->Valid()); } - std::unique_ptr iter(db->NewIterator(read_options)); - perf_context.Reset(); - StopWatchNano timer(Env::Default(), true); - iter->Seek(key); - hist_no_seek_time.Add(timer.ElapsedNanos()); - hist_no_seek_comparison.Add(perf_context.user_key_comparison_count); - ASSERT_TRUE(!iter->Valid()); + std::cout << "non-existing Seek key comparison: \n" + << hist_no_seek_comparison.ToString() + << "non-existing Seek time: \n" + << hist_no_seek_time.ToString(); } - - std::cout << "non-existing Seek key comparison: \n" - << hist_no_seek_comparison.ToString() - << "non-existing Seek time: \n" - << hist_no_seek_time.ToString(); } } diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index c50c7b61a..e24030ddc 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -268,6 +268,13 @@ extern MemTableRepFactory* NewHashSkipListRepFactory( int32_t skiplist_height = 4, int32_t skiplist_branching_factor = 4 ); +// The factory is to create memtables with a hashed linked list: +// it contains a fixed array of buckets, 
each pointing to a sorted single +// linked list (null if the bucket is empty). +// bucket_count: number of fixed array buckets +extern MemTableRepFactory* NewHashLinkListRepFactory( + const SliceTransform* transform, size_t bucket_count = 50000); + } #endif // STORAGE_ROCKSDB_DB_MEMTABLEREP_H_ diff --git a/util/hash_linklist_rep.cc b/util/hash_linklist_rep.cc new file mode 100644 index 000000000..e53bffbb6 --- /dev/null +++ b/util/hash_linklist_rep.cc @@ -0,0 +1,462 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// + +#include "util/hash_linklist_rep.h" + +#include "rocksdb/memtablerep.h" +#include "rocksdb/arena.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "port/port.h" +#include "port/atomic_pointer.h" +#include "util/murmurhash.h" +#include "db/memtable.h" +#include "db/skiplist.h" + +namespace rocksdb { +namespace { + +typedef const char* Key; + +struct Node { + explicit Node(const Key& k) : + key(k) { + } + + Key const key; + + // Accessors/mutators for links. Wrapped in methods so we can + // add the appropriate barriers as necessary. + Node* Next() { + // Use an 'acquire load' so that we observe a fully initialized + // version of the returned Node. + return reinterpret_cast(next_.Acquire_Load()); + } + void SetNext(Node* x) { + // Use a 'release store' so that anybody who reads through this + // pointer observes a fully initialized version of the inserted node. + next_.Release_Store(x); + } + + // No-barrier variants that can be safely used in a few locations. + Node* NoBarrier_Next() { + return reinterpret_cast(next_.NoBarrier_Load()); + } + void NoBarrier_SetNext(Node* x) { + next_.NoBarrier_Store(x); + } + +private: + port::AtomicPointer next_; +}; + +class HashLinkListRep : public MemTableRep { + public: + HashLinkListRep(MemTableRep::KeyComparator& compare, Arena* arena, + const SliceTransform* transform, size_t bucket_size); + + virtual void Insert(const char* key) override; + + virtual bool Contains(const char* key) const override; + + virtual size_t ApproximateMemoryUsage() override; + + virtual ~HashLinkListRep(); + + virtual std::shared_ptr GetIterator() override; + + virtual std::shared_ptr GetIterator( + const Slice& slice) override; + + virtual std::shared_ptr GetPrefixIterator( + const Slice& prefix) override; + + virtual std::shared_ptr GetDynamicPrefixIterator() + override; + + private: + friend class DynamicIterator; + typedef SkipList FullList; + + size_t bucket_size_; + + // Maps slices (which are transformed user keys) to buckets of keys sharing + // the same transform. + port::AtomicPointer* buckets_; + + // The user-supplied transform whose domain is the user keys. 
+ const SliceTransform* transform_; + + MemTableRep::KeyComparator& compare_; + // immutable after construction + Arena* const arena_; + + bool BucketContains(Node* head, const Key& key) const; + + size_t GetHash(const Slice& slice) const { + return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_; + } + + Node* GetBucket(size_t i) const { + return static_cast(buckets_[i].Acquire_Load()); + } + + Node* GetBucket(const Slice& slice) const { + return GetBucket(GetHash(slice)); + } + + Node* NewNode(const Key& key) { + char* mem = arena_->AllocateAligned(sizeof(Node)); + return new (mem) Node(key); + } + + bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); } + + bool KeyIsAfterNode(const Key& key, const Node* n) const { + // nullptr n is considered infinite + return (n != nullptr) && (compare_(n->key, key) < 0); + } + + Node* FindGreaterOrEqualInBucket(Node* head, const Key& key) const; + + class FullListIterator : public MemTableRep::Iterator { + public: + explicit FullListIterator(FullList* list) + : iter_(list) {} + + virtual ~FullListIterator() { + } + + // Returns true iff the iterator is positioned at a valid node. + virtual bool Valid() const { + return iter_.Valid(); + } + + // Returns the key at the current position. + // REQUIRES: Valid() + virtual const char* key() const { + assert(Valid()); + return iter_.key(); + } + + // Advances to the next position. + // REQUIRES: Valid() + virtual void Next() { + assert(Valid()); + iter_.Next(); + } + + // Advances to the previous position. + // REQUIRES: Valid() + virtual void Prev() { + assert(Valid()); + iter_.Prev(); + } + + // Advance to the first entry with a key >= target + virtual void Seek(const Slice& internal_key, const char* memtable_key) { + const char* encoded_key = + (memtable_key != nullptr) ? + memtable_key : EncodeKey(&tmp_, internal_key); + iter_.Seek(encoded_key); + } + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToFirst() { + iter_.SeekToFirst(); + } + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToLast() { + iter_.SeekToLast(); + } + private: + FullList::Iterator iter_; + std::string tmp_; // For passing to EncodeKey + }; + + class Iterator : public MemTableRep::Iterator { + public: + explicit Iterator(const HashLinkListRep* const hash_link_list_rep, + Node* head) : + hash_link_list_rep_(hash_link_list_rep), head_(head), node_(nullptr) { + } + + virtual ~Iterator() { + } + + // Returns true iff the iterator is positioned at a valid node. + virtual bool Valid() const { + return node_ != nullptr; + } + + // Returns the key at the current position. + // REQUIRES: Valid() + virtual const char* key() const { + assert(Valid()); + return node_->key; + } + + // Advances to the next position. + // REQUIRES: Valid() + virtual void Next() { + assert(Valid()); + node_ = node_->Next(); + } + + // Advances to the previous position. + // REQUIRES: Valid() + virtual void Prev() { + // Prefix iterator does not support total order. + // We simply set the iterator to invalid state + Reset(nullptr); + } + + // Advance to the first entry with a key >= target + virtual void Seek(const Slice& internal_key, const char* memtable_key) { + const char* encoded_key = + (memtable_key != nullptr) ? 
+ memtable_key : EncodeKey(&tmp_, internal_key); + node_ = hash_link_list_rep_->FindGreaterOrEqualInBucket(head_, + encoded_key); + } + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToFirst() { + // Prefix iterator does not support total order. + // We simply set the iterator to invalid state + Reset(nullptr); + } + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToLast() { + // Prefix iterator does not support total order. + // We simply set the iterator to invalid state + Reset(nullptr); + } + + protected: + void Reset(Node* head) { + head_ = head; + node_ = nullptr; + } + private: + friend class HashLinkListRep; + const HashLinkListRep* const hash_link_list_rep_; + Node* head_; + Node* node_; + std::string tmp_; // For passing to EncodeKey + + virtual void SeekToHead() { + node_ = head_; + } + }; + + class DynamicIterator : public HashLinkListRep::Iterator { + public: + explicit DynamicIterator(HashLinkListRep& memtable_rep) + : HashLinkListRep::Iterator(&memtable_rep, nullptr), + memtable_rep_(memtable_rep) {} + + // Advance to the first entry with a key >= target + virtual void Seek(const Slice& k, const char* memtable_key) { + auto transformed = memtable_rep_.transform_->Transform(k); + Reset(memtable_rep_.GetBucket(transformed)); + HashLinkListRep::Iterator::Seek(k, memtable_key); + } + + private: + // the underlying memtable + const HashLinkListRep& memtable_rep_; + }; + + class EmptyIterator : public MemTableRep::Iterator { + // This is used when there wasn't a bucket. It is cheaper than + // instantiating an empty bucket over which to iterate. + public: + EmptyIterator() { } + virtual bool Valid() const { + return false; + } + virtual const char* key() const { + assert(false); + return nullptr; + } + virtual void Next() { } + virtual void Prev() { } + virtual void Seek(const Slice& user_key, const char* memtable_key) { } + virtual void SeekToFirst() { } + virtual void SeekToLast() { } + private: + }; + + std::shared_ptr empty_iterator_; +}; + +HashLinkListRep::HashLinkListRep(MemTableRep::KeyComparator& compare, + Arena* arena, const SliceTransform* transform, + size_t bucket_size) + : bucket_size_(bucket_size), + transform_(transform), + compare_(compare), + arena_(arena), + empty_iterator_(std::make_shared()) { + + char* mem = arena_->AllocateAligned( + sizeof(port::AtomicPointer) * bucket_size); + + buckets_ = new (mem) port::AtomicPointer[bucket_size]; + + for (size_t i = 0; i < bucket_size_; ++i) { + buckets_[i].NoBarrier_Store(nullptr); + } +} + +HashLinkListRep::~HashLinkListRep() { +} + +void HashLinkListRep::Insert(const char* key) { + assert(!Contains(key)); + auto transformed = transform_->Transform(UserKey(key)); + auto& bucket = buckets_[GetHash(transformed)]; + Node* head = static_cast(bucket.Acquire_Load()); + + if (!head) { + Node* x = NewNode(key); + // NoBarrier_SetNext() suffices since we will add a barrier when + // we publish a pointer to "x" in prev[i]. + x->NoBarrier_SetNext(nullptr); + bucket.Release_Store(static_cast(x)); + return; + } + + Node* cur = head; + Node* prev = nullptr; + while (true) { + if (cur == nullptr) { + break; + } + Node* next = cur->Next(); + // Make sure the lists are sorted. + // If x points to head_ or next points nullptr, it is trivially satisfied. 
+ assert((cur == head) || (next == nullptr) || + KeyIsAfterNode(next->key, cur)); + if (KeyIsAfterNode(key, cur)) { + // Keep searching in this list + prev = cur; + cur = next; + } else { + break; + } + } + + // Our data structure does not allow duplicate insertion + assert(cur == nullptr || !Equal(key, cur->key)); + + Node* x = NewNode(key); + + // NoBarrier_SetNext() suffices since we will add a barrier when + // we publish a pointer to "x" in prev[i]. + x->NoBarrier_SetNext(cur); + + if (prev) { + prev->SetNext(x); + } else { + bucket.Release_Store(static_cast(x)); + } +} + +bool HashLinkListRep::Contains(const char* key) const { + auto transformed = transform_->Transform(UserKey(key)); + auto bucket = GetBucket(transformed); + if (bucket == nullptr) { + return false; + } + return BucketContains(bucket, key); +} + +size_t HashLinkListRep::ApproximateMemoryUsage() { + // Memory is always allocated from the arena. + return 0; +} + +std::shared_ptr HashLinkListRep::GetIterator() { + auto list = new FullList(compare_, arena_); + for (size_t i = 0; i < bucket_size_; ++i) { + auto bucket = GetBucket(i); + if (bucket != nullptr) { + Iterator itr(this, bucket); + for (itr.SeekToHead(); itr.Valid(); itr.Next()) { + list->Insert(itr.key()); + } + } + } + return std::make_shared(list); +} + +std::shared_ptr HashLinkListRep::GetPrefixIterator( + const Slice& prefix) { + auto bucket = GetBucket(prefix); + if (bucket == nullptr) { + return empty_iterator_; + } + return std::make_shared(this, bucket); +} + +std::shared_ptr HashLinkListRep::GetIterator( + const Slice& slice) { + return GetPrefixIterator(transform_->Transform(slice)); +} + +std::shared_ptr + HashLinkListRep::GetDynamicPrefixIterator() { + return std::make_shared(*this); +} + +bool HashLinkListRep::BucketContains(Node* head, const Key& key) const { + Node* x = FindGreaterOrEqualInBucket(head, key); + return (x != nullptr && Equal(key, x->key)); +} + +Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head, + const Key& key) const { + Node* x = head; + while (true) { + if (x == nullptr) { + return x; + } + Node* next = x->Next(); + // Make sure the lists are sorted. + // If x points to head_ or next points nullptr, it is trivially satisfied. + assert((x == head) || (next == nullptr) || KeyIsAfterNode(next->key, x)); + if (KeyIsAfterNode(key, x)) { + // Keep searching in this list + x = next; + } else { + break; + } + } + return x; +} + +} // anon namespace + +std::shared_ptr HashLinkListRepFactory::CreateMemTableRep( + MemTableRep::KeyComparator& compare, Arena* arena) { + return std::make_shared(compare, arena, transform_, + bucket_count_); +} + +MemTableRepFactory* NewHashLinkListRepFactory( + const SliceTransform* transform, size_t bucket_count) { + return new HashLinkListRepFactory(transform, bucket_count); +} + +} // namespace rocksdb diff --git a/util/hash_linklist_rep.h b/util/hash_linklist_rep.h new file mode 100644 index 000000000..16d2517c2 --- /dev/null +++ b/util/hash_linklist_rep.h @@ -0,0 +1,39 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include "rocksdb/slice_transform.h" +#include "rocksdb/memtablerep.h" + +namespace rocksdb { + +class HashLinkListRepFactory : public MemTableRepFactory { + public: + explicit HashLinkListRepFactory( + const SliceTransform* transform, + size_t bucket_count) + : transform_(transform), + bucket_count_(bucket_count) { } + + virtual ~HashLinkListRepFactory() { delete transform_; } + + virtual std::shared_ptr CreateMemTableRep( + MemTableRep::KeyComparator& compare, Arena* arena) override; + + virtual const char* Name() const override { + return "HashLinkListRepFactory"; + } + + const SliceTransform* GetTransform() { return transform_; } + + private: + const SliceTransform* transform_; + const size_t bucket_count_; +}; + +} From 237a3da677e74cd294139284370272b7fbbc09c3 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Tue, 7 Jan 2014 14:33:15 -0800 Subject: [PATCH 47/70] StopWatch not to get time if it is created for statistics and it is disabled Summary: Currently, even if statistics are not enabled, a StopWatch created only for stats still gets the time of day, which is wasteful. This patch adds a new option to StopWatch to skip that call in this case. Test Plan: make all check Reviewers: dhruba, haobo, igor CC: leveldb Differential Revision: https://reviews.facebook.net/D14703 Conflicts: db/db_impl.cc --- db/db_impl.cc | 10 +++++----- util/stop_watch.h | 5 +++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 86a81951f..8bcdbf4ae 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2080,11 +2080,11 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, if (s.ok() && !options_.disableDataSync) { if (options_.use_fsync) { StopWatch sw(env_, options_.statistics.get(), - COMPACTION_OUTFILE_SYNC_MICROS); + COMPACTION_OUTFILE_SYNC_MICROS, false); s = compact->outfile->Fsync(); } else { StopWatch sw(env_, options_.statistics.get(), - COMPACTION_OUTFILE_SYNC_MICROS); + COMPACTION_OUTFILE_SYNC_MICROS, false); s = compact->outfile->Sync(); } } @@ -2717,7 +2717,7 @@ Status DBImpl::GetImpl(const ReadOptions& options, bool* value_found) { Status s; - StopWatch sw(env_, options_.statistics.get(), DB_GET); + StopWatch sw(env_, options_.statistics.get(), DB_GET, false); StopWatchNano snapshot_timer(env_, false); StartPerfTimer(&snapshot_timer); SequenceNumber snapshot; @@ -2798,7 +2798,7 @@ Status DBImpl::GetImpl(const ReadOptions& options, std::vector DBImpl::MultiGet(const ReadOptions& options, const std::vector& keys, std::vector* values) { - StopWatch sw(env_, options_.statistics.get(), DB_MULTIGET); + StopWatch sw(env_, options_.statistics.get(), DB_MULTIGET, false); StopWatchNano snapshot_timer(env_, false); StartPerfTimer(&snapshot_timer); @@ -2958,7 +2958,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { w.disableWAL = options.disableWAL; w.done = false; - StopWatch sw(env_, options_.statistics.get(), DB_WRITE); + StopWatch sw(env_, options_.statistics.get(), DB_WRITE, false); mutex_.Lock(); writers_.push_back(&w); while (!w.done && &w != writers_.front()) { diff --git a/util/stop_watch.h b/util/stop_watch.h index e36bcb7ec..6325a7440 100644 --- a/util/stop_watch.h +++ b/util/stop_watch.h @@ -15,9 +15,10 @@ class StopWatch { explicit StopWatch( Env * const env, Statistics* statistics = nullptr, - const Histograms histogram_name = DB_GET) : + const Histograms histogram_name = DB_GET, + bool auto_start = true) : env_(env), - start_time_(env->NowMicros()), + start_time_((!auto_start && 
!statistics) ? 0 : env->NowMicros()), statistics_(statistics), histogram_name_(histogram_name) {} From 5b5ab0c1a8fc78746d9fd6334d927139f40c3deb Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 9 Jan 2014 22:25:03 -0800 Subject: [PATCH 48/70] [Performance Branch] Fix memory leak in HashLinkListRep.GetIterator() Summary: The full list constructed for the full iterator can be leaked. This was a bug introduced when I copied the full iterator code from hash skip list to hash link list. This patch fixes it. Test Plan: Run the valgrind test against db_test and make sure the memory leak is fixed Reviewers: kailiu, haobo Reviewed By: kailiu CC: igor, leveldb Differential Revision: https://reviews.facebook.net/D15093 --- db/prefix_test.cc | 5 +++-- util/hash_linklist_rep.cc | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/db/prefix_test.cc b/db/prefix_test.cc index 66cef92cb..ca00c31b3 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -108,7 +108,6 @@ class PrefixTest { options.min_write_buffer_number_to_merge = FLAGS_min_write_buffer_number_to_merge; - options.comparator = new TestKeyComparator(); options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits; options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes; @@ -142,7 +141,9 @@ class PrefixTest { return false; } - PrefixTest() : option_config_(kBegin) { } + PrefixTest() : option_config_(kBegin) { + options.comparator = new TestKeyComparator(); + } ~PrefixTest() { delete options.comparator; } diff --git a/util/hash_linklist_rep.cc b/util/hash_linklist_rep.cc index e53bffbb6..214ac2c55 100644 --- a/util/hash_linklist_rep.cc +++ b/util/hash_linklist_rep.cc @@ -125,7 +125,7 @@ class HashLinkListRep : public MemTableRep { class FullListIterator : public MemTableRep::Iterator { public: explicit FullListIterator(FullList* list) - : iter_(list) {} + : iter_(list), full_list_(list) {} virtual ~FullListIterator() { } @@ -177,6 +177,8 @@ class HashLinkListRep : public MemTableRep { } private: FullList::Iterator iter_; + // To destruct with the iterator. + std::unique_ptr full_list_; std::string tmp_; // For passing to EncodeKey }; From aa0ef6602d9f71d1a66d4ae0a3e68a59c2d9b12a Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Mon, 6 Jan 2014 20:29:17 -0800 Subject: [PATCH 49/70] [Performance Branch] If options.max_open_files is set to -1, cache table readers in FileMetadata for Get() and NewIterator() Summary: In some use cases, table readers for all live files should always be cached. In that case, there is an opportunity to avoid the table cache look-up during Get() and NewIterator(). We define options.max_open_files = -1 as the mode in which table readers for live files are always kept. In that mode, table readers are cached in FileMetaData (with a reference count held in the table cache), so table_cache.Get() and table_cache.NewIterator() can bypass the LRU cache check, reducing latency.
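To make the fast path concrete, here is a minimal self-contained C++ sketch of the idea. The types below (TableReader, CacheHandle, FileMetaData, TableCache) are simplified stand-ins, not the real RocksDB classes, and the pre-loading step only mirrors what this patch's LoadTableHandlers() does:

#include <cassert>
#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>

struct TableReader {  // stand-in for a parsed SST file
  std::string Get(const std::string& key) { return "value-for-" + key; }
};

struct CacheHandle {  // stand-in for Cache::Handle
  std::unique_ptr<TableReader> reader;
};

struct FileMetaData {  // per-file metadata kept by the version set
  uint64_t number = 0;
  CacheHandle* table_reader_handle = nullptr;  // non-null once pre-loaded
};

class TableCache {
 public:
  // Stand-in for the LRU lookup: find or open the table for this file.
  CacheHandle* FindTable(uint64_t file_number) {
    CacheHandle& h = cache_[file_number];
    if (!h.reader) h.reader = std::make_unique<TableReader>();
    return &h;
  }
  std::string Get(const FileMetaData& meta, const std::string& key) {
    // Fast path: handle already pinned in the metadata, no cache lookup.
    CacheHandle* handle = meta.table_reader_handle;
    if (handle == nullptr) handle = FindTable(meta.number);  // slow path
    return handle->reader->Get(key);
  }
 private:
  std::unordered_map<uint64_t, CacheHandle> cache_;
};

int main() {
  TableCache table_cache;
  FileMetaData meta;
  meta.number = 7;
  // In the max_open_files == -1 mode, every live file's handle would be
  // pre-loaded once, analogous to LoadTableHandlers() in this patch.
  meta.table_reader_handle = table_cache.FindTable(meta.number);
  assert(table_cache.Get(meta, "k") == "value-for-k");
  return 0;
}

The point of the design is that the per-read cost drops from a cache probe (hashing plus locking) to a single pointer dereference; the trade-off is that every live file keeps its table reader pinned in memory.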
Test Plan: add a test case in db_test Reviewers: haobo, kailiu Reviewed By: haobo CC: dhruba, igor, leveldb Differential Revision: https://reviews.facebook.net/D15039 --- db/builder.cc | 3 +-- db/db_impl.cc | 21 ++++++++++----- db/db_test.cc | 4 +++ db/repair.cc | 3 ++- db/table_cache.cc | 54 ++++++++++++++++++++++--------------- db/table_cache.h | 22 +++++++++------ db/version_edit.h | 11 ++++++-- db/version_set.cc | 56 ++++++++++++++++++++++++++++----------- db/version_set.h | 2 +- include/rocksdb/options.h | 6 +++-- 10 files changed, 124 insertions(+), 58 deletions(-) diff --git a/db/builder.cc b/db/builder.cc index ad1334a15..53d2f8985 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -204,8 +204,7 @@ Status BuildTable(const std::string& dbname, // Verify that the table is usable Iterator* it = table_cache->NewIterator(ReadOptions(), soptions, - meta->number, - meta->file_size); + *meta); s = it->status(); delete it; } diff --git a/db/db_impl.cc b/db/db_impl.cc index 8bcdbf4ae..297ac0ade 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -126,7 +126,10 @@ Options SanitizeOptions(const std::string& dbname, Options result = src; result.comparator = icmp; result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr; - ClipToRange(&result.max_open_files, 20, 1000000); + // result.max_open_files == -1 means an "infinite" number of open files. + if (result.max_open_files != -1) { + ClipToRange(&result.max_open_files, 20, 1000000); + } ClipToRange(&result.write_buffer_size, ((size_t)64)<<10, ((size_t)64)<<30); ClipToRange(&result.block_size, 1<<10, 4<<20); @@ -278,7 +281,10 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) } // Reserve ten files or so for other uses and give the rest to TableCache. + // Give a large number for the "infinite" open files setting. - const int table_cache_size = options_.max_open_files - 10; + const int table_cache_size = + (options_.max_open_files == -1) ? + 4194304 : options_.max_open_files - 10; table_cache_.reset(new TableCache(dbname_, &options_, storage_options_, table_cache_size)); versions_.reset(new VersionSet(dbname_, &options_, storage_options_, @@ -335,6 +341,9 @@ DBImpl::~DBImpl() { for (MemTable* m: to_delete) { delete m; } + // versions need to be destroyed before table_cache since it can hold + // references to table_cache. + versions_.reset(); LogFlush(options_.info_log); } @@ -2095,10 +2104,10 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, if (s.ok() && current_entries > 0) { // Verify that the table is usable + FileMetaData meta(output_number, current_bytes); Iterator* iter = table_cache_->NewIterator(ReadOptions(), storage_options_, - output_number, - current_bytes); + meta); s = iter->status(); delete iter; if (s.ok()) { @@ -3701,7 +3710,7 @@ Status DBImpl::DeleteFile(std::string name) { } int level; - FileMetaData metadata; + FileMetaData* metadata; int maxlevel = NumberLevels(); VersionEdit edit(maxlevel); DeletionState deletion_state(true); @@ -3716,7 +3725,7 @@ Status DBImpl::DeleteFile(std::string name) { assert((level > 0) && (level < maxlevel)); // If the file is being compacted no need to delete. - if (metadata.being_compacted) { + if (metadata->being_compacted) { Log(options_.info_log, "DeleteFile %s Skipped. 
File about to be compacted\n", name.c_str()); return Status::OK(); diff --git a/db/db_test.cc b/db/db_test.cc index 8e2bc9f27..838492f1a 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -265,6 +265,7 @@ class DBTest { kHashSkipList, kUniversalCompaction, kCompressedBlockCache, + kInfiniteMaxOpenFiles, kEnd }; int option_config_; @@ -415,6 +416,9 @@ class DBTest { case kCompressedBlockCache: options.block_cache_compressed = NewLRUCache(8*1024*1024); break; + case kInfiniteMaxOpenFiles: + options.max_open_files = -1; + break; default: break; } diff --git a/db/repair.cc b/db/repair.cc index fc9ba282d..802c04fc4 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -265,8 +265,9 @@ class Repairer { int counter = 0; Status status = env_->GetFileSize(fname, &t->meta.file_size); if (status.ok()) { + FileMetaData dummy_meta(t->meta.number, t->meta.file_size); Iterator* iter = table_cache_->NewIterator( - ReadOptions(), storage_options_, t->meta.number, t->meta.file_size); + ReadOptions(), storage_options_, dummy_meta); bool empty = true; ParsedInternalKey parsed; t->min_sequence = 0; diff --git a/db/table_cache.cc b/db/table_cache.cc index 593352dde..527a10cba 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -10,6 +10,7 @@ #include "db/table_cache.h" #include "db/filename.h" +#include "db/version_edit.h" #include "rocksdb/statistics.h" #include "rocksdb/table.h" @@ -50,6 +51,14 @@ TableCache::TableCache(const std::string& dbname, TableCache::~TableCache() { } +TableReader* TableCache::GetTableReaderFromHandle(Cache::Handle* handle) { + return reinterpret_cast(cache_->Value(handle)); +} + +void TableCache::ReleaseHandle(Cache::Handle* handle) { + cache_->Release(handle); +} + Status TableCache::FindTable(const EnvOptions& toptions, uint64_t file_number, uint64_t file_size, Cache::Handle** handle, bool* table_io, @@ -94,25 +103,27 @@ Status TableCache::FindTable(const EnvOptions& toptions, Iterator* TableCache::NewIterator(const ReadOptions& options, const EnvOptions& toptions, - uint64_t file_number, - uint64_t file_size, + const FileMetaData& file_meta, TableReader** table_reader_ptr, bool for_compaction) { if (table_reader_ptr != nullptr) { *table_reader_ptr = nullptr; } - - Cache::Handle* handle = nullptr; - Status s = FindTable(toptions, file_number, file_size, &handle, - nullptr, options.read_tier == kBlockCacheTier); + Cache::Handle* handle = file_meta.table_reader_handle; + Status s; + if (!handle) { + s = FindTable(toptions, file_meta.number, file_meta.file_size, &handle, + nullptr, options.read_tier == kBlockCacheTier); + } if (!s.ok()) { return NewErrorIterator(s); } - TableReader* table_reader = - reinterpret_cast(cache_->Value(handle)); + TableReader* table_reader = GetTableReaderFromHandle(handle); Iterator* result = table_reader->NewIterator(options); - result->RegisterCleanup(&UnrefEntry, cache_.get(), handle); + if (!file_meta.table_reader_handle) { + result->RegisterCleanup(&UnrefEntry, cache_.get(), handle); + } if (table_reader_ptr != nullptr) { *table_reader_ptr = table_reader; } @@ -125,22 +136,24 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, } Status TableCache::Get(const ReadOptions& options, - uint64_t file_number, - uint64_t file_size, + const FileMetaData& file_meta, const Slice& k, void* arg, bool (*saver)(void*, const Slice&, const Slice&, bool), bool* table_io, void (*mark_key_may_exist)(void*)) { - Cache::Handle* handle = nullptr; - Status s = FindTable(storage_options_, file_number, file_size, - &handle, table_io, - options.read_tier == 
kBlockCacheTier); + Cache::Handle* handle = file_meta.table_reader_handle; + Status s; + if (!handle) { + s = FindTable(storage_options_, file_meta.number, file_meta.file_size, + &handle, table_io, options.read_tier == kBlockCacheTier); + } if (s.ok()) { - TableReader* t = - reinterpret_cast(cache_->Value(handle)); + TableReader* t = GetTableReaderFromHandle(handle); s = t->Get(options, k, arg, saver, mark_key_may_exist); - cache_->Release(handle); + if (!file_meta.table_reader_handle) { + ReleaseHandle(handle); + } } else if (options.read_tier && s.IsIncomplete()) { // Couldnt find Table in cache but treat as kFound if no_io set (*mark_key_may_exist)(arg); @@ -159,10 +172,9 @@ bool TableCache::PrefixMayMatch(const ReadOptions& options, file_size, &handle, table_io); bool may_match = true; if (s.ok()) { - TableReader* t = - reinterpret_cast(cache_->Value(handle)); + TableReader* t = GetTableReaderFromHandle(handle); may_match = t->PrefixMayMatch(internal_prefix); - cache_->Release(handle); + ReleaseHandle(handle); } return may_match; } diff --git a/db/table_cache.h b/db/table_cache.h index f65326bad..ba50ae4d5 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -21,6 +21,7 @@ namespace rocksdb { class Env; +struct FileMetaData; class TableCache { public: @@ -37,8 +38,7 @@ class TableCache { // returned iterator is live. Iterator* NewIterator(const ReadOptions& options, const EnvOptions& toptions, - uint64_t file_number, - uint64_t file_size, + const FileMetaData& file_meta, TableReader** table_reader_ptr = nullptr, bool for_compaction = false); @@ -46,8 +46,7 @@ class TableCache { // call (*handle_result)(arg, found_key, found_value) repeatedly until // it returns false. Status Get(const ReadOptions& options, - uint64_t file_number, - uint64_t file_size, + const FileMetaData& file_meta, const Slice& k, void* arg, bool (*handle_result)(void*, const Slice&, const Slice&, bool), @@ -63,16 +62,23 @@ class TableCache { // Evict any entry for the specified file number void Evict(uint64_t file_number); + // Find table reader + Status FindTable(const EnvOptions& toptions, uint64_t file_number, + uint64_t file_size, Cache::Handle**, bool* table_io=nullptr, + const bool no_io = false); + + // Get TableReader from a cache handle. + TableReader* GetTableReaderFromHandle(Cache::Handle* handle); + + // Release the handle from a cache + void ReleaseHandle(Cache::Handle* handle); + private: Env* const env_; const std::string dbname_; const Options* options_; const EnvOptions& storage_options_; std::shared_ptr cache_; - - Status FindTable(const EnvOptions& toptions, uint64_t file_number, - uint64_t file_size, Cache::Handle**, bool* table_io=nullptr, - const bool no_io = false); }; } // namespace rocksdb diff --git a/db/version_edit.h b/db/version_edit.h index d6fac1c3c..7751ad92e 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -11,6 +11,7 @@ #include #include #include +#include "rocksdb/cache.h" #include "db/dbformat.h" namespace rocksdb { @@ -28,8 +29,14 @@ struct FileMetaData { SequenceNumber smallest_seqno;// The smallest seqno in this file SequenceNumber largest_seqno; // The largest seqno in this file - FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0), - being_compacted(false) { } + // Needs to be disposed when refs becomes 0. 
+ Cache::Handle* table_reader_handle; + + FileMetaData(uint64_t number, uint64_t file_size) : + refs(0), allowed_seeks(1 << 30), number(number), file_size(file_size), + being_compacted(false), table_reader_handle(nullptr) { + } + FileMetaData() : FileMetaData(0, 0) { } }; class VersionEdit { diff --git a/db/version_set.cc b/db/version_set.cc index e2421ef92..eb58e2780 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -51,6 +51,10 @@ Version::~Version() { assert(f->refs > 0); f->refs--; if (f->refs <= 0) { + if (f->table_reader_handle) { + vset_->table_cache_->ReleaseHandle(f->table_reader_handle); + f->table_reader_handle = nullptr; + } vset_->obsolete_files_.push_back(f); } } @@ -202,10 +206,11 @@ static Iterator* GetFileIterator(void* arg, options_copy = options; options_copy.prefix = nullptr; } + FileMetaData meta(DecodeFixed64(file_value.data()), + DecodeFixed64(file_value.data() + 8)); return cache->NewIterator(options.prefix ? options_copy : options, soptions, - DecodeFixed64(file_value.data()), - DecodeFixed64(file_value.data() + 8), + meta, nullptr /* don't need reference to table*/, for_compaction); } @@ -257,9 +262,8 @@ void Version::AddIterators(const ReadOptions& options, std::vector* iters) { // Merge all level zero files together since they may overlap for (const FileMetaData* file : files_[0]) { - iters->push_back( - vset_->table_cache_->NewIterator( - options, soptions, file->number, file->file_size)); + iters->push_back(vset_->table_cache_->NewIterator(options, soptions, + *file)); } // For levels > 0, we can use a concatenating iterator that sequentially @@ -513,9 +517,8 @@ void Version::Get(const ReadOptions& options, prev_file = f; #endif bool tableIO = false; - *status = vset_->table_cache_->Get(options, f->number, f->file_size, - ikey, &saver, SaveValue, &tableIO, - MarkKeyMayExist); + *status = vset_->table_cache_->Get(options, *f, ikey, &saver, SaveValue, + &tableIO, MarkKeyMayExist); // TODO: examine the behavior for corrupted key if (!status->ok()) { return; @@ -954,6 +957,11 @@ class VersionSet::Builder { FileMetaData* f = to_unref[i]; f->refs--; if (f->refs <= 0) { + if (f->table_reader_handle) { + vset_->table_cache_->ReleaseHandle( + f->table_reader_handle); + f->table_reader_handle = nullptr; + } delete f; } } @@ -1113,6 +1121,20 @@ class VersionSet::Builder { CheckConsistency(v); } + void LoadTableHandlers() { + for (int level = 0; level < vset_->NumberLevels(); level++) { + for (auto& file_meta : *(levels_[level].added_files)) { + assert (!file_meta->table_reader_handle); + bool table_io; + vset_->table_cache_->FindTable(vset_->storage_options_, + file_meta->number, + file_meta->file_size, + &file_meta->table_reader_handle, + &table_io, false); + } + } + } + void MaybeAddFile(Version* v, int level, FileMetaData* f) { if (levels_[level].deleted_files.count(f->number) > 0) { // File is deleted: do nothing @@ -1258,7 +1280,7 @@ Status VersionSet::LogAndApply( edit->SetNextFile(next_file_number_); } - // Unlock during expensive MANIFEST log write. New writes cannot get here + // Unlock during expensive operations. New writes cannot get here // because &w is ensuring that all new writes get queued. { // calculate the amount of data being compacted at every level @@ -1267,6 +1289,12 @@ Status VersionSet::LogAndApply( mu->Unlock(); + if (options_->max_open_files == -1) { + // unlimited table cache. Pre-load table handle now. + // Need to do it out of the mutex. 
+ builder.LoadTableHandlers(); + } + // This is fine because everything inside of this block is serialized -- // only one thread can be here at the same time if (!new_manifest_filename.empty()) { @@ -1966,8 +1994,7 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { // approximate offset of "ikey" within the table. TableReader* table_reader_ptr; Iterator* iter = table_cache_->NewIterator( - ReadOptions(), storage_options_, files[i]->number, - files[i]->file_size, &table_reader_ptr); + ReadOptions(), storage_options_, *(files[i]), &table_reader_ptr); if (table_reader_ptr != nullptr) { result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode()); } @@ -2092,8 +2119,7 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { for (size_t i = 0; i < files.size(); i++) { list[num++] = table_cache_->NewIterator( options, storage_options_compactions_, - files[i]->number, files[i]->file_size, nullptr, - true /* for compaction */); + *(files[i]), nullptr, true /* for compaction */); } } else { // Create concatenating iterator for the files from this level @@ -2876,12 +2902,12 @@ void VersionSet::SetupOtherInputs(Compaction* c) { Status VersionSet::GetMetadataForFile( uint64_t number, int *filelevel, - FileMetaData *meta) { + FileMetaData **meta) { for (int level = 0; level < NumberLevels(); level++) { const std::vector& files = current_->files_[level]; for (size_t i = 0; i < files.size(); i++) { if (files[i]->number == number) { - *meta = *files[i]; + *meta = files[i]; *filelevel = level; return Status::OK(); } diff --git a/db/version_set.h b/db/version_set.h index 3f8f95585..d5dc2cb13 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -431,7 +431,7 @@ class VersionSet { double MaxBytesForLevel(int level); Status GetMetadataForFile( - uint64_t number, int *filelevel, FileMetaData *metadata); + uint64_t number, int *filelevel, FileMetaData **metadata); void GetLiveFilesMetaData( std::vector *metadata); diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 5041ea593..660d36838 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -182,8 +182,10 @@ struct Options { int min_write_buffer_number_to_merge; // Number of open files that can be used by the DB. You may need to - // increase this if your database has a large working set (budget - // one open file per 2MB of working set). + // increase this if your database has a large working set. Value -1 means + // files opened are always kept open. You can estimate number of files based + // on target_file_size_base and target_file_size_multiplier for level-based + // compaction. For universal-style compaction, you can usually set it to -1. + // // Default: 1000 int max_open_files; From 8454cfe5692c041509efa5b4cad36dcddff1a4b0 Mon Sep 17 00:00:00 2001 From: Naman Gupta Date: Tue, 14 Jan 2014 07:55:16 -0800 Subject: [PATCH 50/70] Add read/modify/write functionality to Put() API Summary: The application can set a callback function, which is applied to the previous value to compute the new value. The new value can be set in place if the previous value exists in the memtable and the new value is not larger than the previous value; otherwise the new value is added normally. Test Plan: fbmake. Added unit tests. All unit tests pass.
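For reference, a minimal sketch of the callback contract under simplified assumptions (Slice is aliased to std::string here, and UpdateInPlace is an illustrative stand-in for what an application would install as the in-place update callback, not the library's declaration): returning true means the callback rewrote the previous value's buffer in place; returning false means it filled new_value, which is then added as a regular Put.

#include <cstring>
#include <iostream>
#include <string>

using Slice = std::string;  // stand-in for rocksdb::Slice

// Shaped like the in-place update callback exercised by the new tests:
// overwrite the old buffer when the delta fits, otherwise hand back a
// full new value for a normal add.
bool UpdateInPlace(char* prev_value, size_t prev_size,
                   const Slice& delta, std::string* new_value) {
  if (prev_value == nullptr || delta.size() > prev_size) {
    *new_value = delta;  // no previous value, or the old buffer is too small
    return false;        // caller falls back to a normal Put
  }
  std::memcpy(prev_value, delta.data(), delta.size());  // update in place
  return true;
}

int main() {
  std::string stored = "aaaaa";  // pretend this value lives in the memtable
  std::string new_value;
  if (UpdateInPlace(&stored[0], stored.size(), "bb", &new_value)) {
    std::cout << "updated in place: " << stored << "\n";  // prints "bbaaa"
  } else {
    std::cout << "added normally: " << new_value << "\n";
  }
  return 0;
}

Note that the real memtable also rewrites the varint length prefix when the value shrinks; the sketch skips that bookkeeping and simply overwrites the front of the buffer, leaving the old tail bytes in place.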
Reviewers: dhruba, haobo Reviewed By: haobo CC: sdong, kailiu, xinyaohu, sumeet, leveldb Differential Revision: https://reviews.facebook.net/D14745 --- db/db_test.cc | 150 +++++++++++++++++++++++++++++--------- db/memtable.cc | 113 +++++++++++++++++++++++----- db/memtable.h | 31 ++++++-- db/write_batch.cc | 38 +++++++++- include/rocksdb/options.h | 46 ++++++++++-- util/options.cc | 1 + 6 files changed, 309 insertions(+), 70 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 838492f1a..ad55ecb1b 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -555,7 +555,7 @@ class DBTest { case kTypeDeletion: result += "DEL"; break; - case kTypeLogData: + default: assert(false); break; } @@ -705,6 +705,44 @@ class DBTest { ASSERT_EQ(IterStatus(iter), expected_key); delete iter; } + + + // Used to test InplaceUpdate + + // If previous value is nullptr or delta is > than previous value, + // sets newValue with delta + // If previous value is not empty, + // updates previous value with 'b' string of previous value size + static bool updateInPlace(char* prevValue, size_t prevSize, + Slice delta, std::string* newValue) { + if (prevValue == nullptr || delta.size() > prevSize) { + *newValue = std::string(delta.size(), 'c'); + return false; + } else { + std::string str_b = std::string(prevSize, 'b'); + memcpy(prevValue, str_b.c_str(), str_b.size()); + return true; + } + } + + // Used to test InplaceUpdate + void validateNumberOfEntries(int numValues) { + Iterator* iter = dbfull()->TEST_NewInternalIterator(); + iter->SeekToFirst(); + ASSERT_EQ(iter->status().ok(), true); + int seq = numValues; + while (iter->Valid()) { + ParsedInternalKey ikey; + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + + // checks sequence number for updates + ASSERT_EQ(ikey.sequence, (unsigned)seq--); + iter->Next(); + } + delete iter; + ASSERT_EQ(0, seq); + } }; std::unique_ptr DBTest::prefix_1_transform( NewFixedPrefixTransform(1)); @@ -2391,9 +2429,9 @@ TEST(DBTest, InPlaceUpdate) { options.inplace_update_support = true; options.env = env_; options.write_buffer_size = 100000; + Reopen(&options); // Update key with values of smaller size - Reopen(&options); int numValues = 10; for (int i = numValues; i > 0; i--) { std::string value = DummyString(i, 'a'); @@ -2401,50 +2439,92 @@ TEST(DBTest, InPlaceUpdate) { ASSERT_EQ(value, Get("key")); } - int count = 0; - Iterator* iter = dbfull()->TEST_NewInternalIterator(); - iter->SeekToFirst(); - ASSERT_EQ(iter->status().ok(), true); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - count++; - // All updates with the same sequence number. - ASSERT_EQ(ikey.sequence, (unsigned)1); - iter->Next(); - } // Only 1 instance for that key. 
- ASSERT_EQ(count, 1); - delete iter; + validateNumberOfEntries(1); + + } while (ChangeCompactOptions()); +} + +TEST(DBTest, InPlaceUpdateLargeNewValue) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + options.env = env_; + options.write_buffer_size = 100000; + Reopen(&options); // Update key with values of larger size - DestroyAndReopen(&options); - numValues = 10; + int numValues = 10; for (int i = 0; i < numValues; i++) { std::string value = DummyString(i, 'a'); ASSERT_OK(Put("key", value)); ASSERT_EQ(value, Get("key")); } - count = 0; - iter = dbfull()->TEST_NewInternalIterator(); - iter->SeekToFirst(); - ASSERT_EQ(iter->status().ok(), true); - int seq = numValues; - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - count++; - // No inplace updates. All updates are puts with new seq number - ASSERT_EQ(ikey.sequence, (unsigned)seq--); - iter->Next(); - } // All 10 updates exist in the internal iterator - ASSERT_EQ(count, numValues); - delete iter; + validateNumberOfEntries(numValues); + } while (ChangeCompactOptions()); +} + + +TEST(DBTest, InPlaceUpdateCallback) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + rocksdb::DBTest::updateInPlace; + Reopen(&options); + + // Update key with values of smaller size + int numValues = 10; + ASSERT_OK(Put("key", DummyString(numValues, 'a'))); + ASSERT_EQ(DummyString(numValues, 'c'), Get("key")); + + for (int i = numValues; i > 0; i--) { + ASSERT_OK(Put("key", DummyString(i, 'a'))); + ASSERT_EQ(DummyString(numValues, 'b'), Get("key")); + } + + // Only 1 instance for that key. + validateNumberOfEntries(1); + + } while (ChangeCompactOptions()); +} + +TEST(DBTest, InPlaceUpdateCallbackNotFound) { + do { + //Test sst get/update/put + } while (ChangeCompactOptions()); +} + +TEST(DBTest, InPlaceUpdateCallbackLargeNewValue) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + rocksdb::DBTest::updateInPlace; + Reopen(&options); + + // Update key with values of larger size + int numValues = 10; + for (int i = 1; i <= numValues; i++) { + ASSERT_OK(Put("key", DummyString(i, 'a'))); + ASSERT_EQ(DummyString(i, 'c'), Get("key")); + } + + // No inplace updates. 
All updates are puts with new seq number + // All 10 updates exist in the internal iterator + validateNumberOfEntries(numValues); } while (ChangeCompactOptions()); } diff --git a/db/memtable.cc b/db/memtable.cc index 55549a142..c556412ea 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -302,7 +302,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, } break; } - case kTypeLogData: + default: assert(false); break; } @@ -322,7 +322,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, return found_final_value; } -bool MemTable::Update(SequenceNumber seq, ValueType type, +void MemTable::Update(SequenceNumber seq, const Slice& key, const Slice& value) { LookupKey lkey(key, seq); @@ -335,7 +335,7 @@ bool MemTable::Update(SequenceNumber seq, ValueType type, if (iter->Valid()) { // entry format is: - // klength varint32 + // key_length varint32 // userkey char[klength-8] // tag uint64 // vlength varint32 @@ -352,37 +352,114 @@ bool MemTable::Update(SequenceNumber seq, ValueType type, const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); switch (static_cast(tag & 0xff)) { case kTypeValue: { - uint32_t vlength; - GetVarint32Ptr(key_ptr + key_length, - key_ptr + key_length+5, &vlength); + Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length); + uint32_t prev_value_size = prev_value.size(); + uint32_t new_value_size = value.size(); + // Update value, if newValue size <= curValue size - if (value.size() <= vlength) { + if (new_value_size <= prev_value_size ) { char* p = EncodeVarint32(const_cast(key_ptr) + key_length, - value.size()); + new_value_size); WriteLock wl(GetLock(lkey.user_key())); - memcpy(p, value.data(), value.size()); + memcpy(p, value.data(), new_value_size); assert( - (p + value.size()) - entry == + (p + new_value_size) - entry == (unsigned) (VarintLength(key_length) + key_length + - VarintLength(value.size()) + - value.size()) + VarintLength(new_value_size) + + new_value_size) ); // no need to update bloom, as user key does not change. - return true; + return; } } default: // If the latest value is kTypeDeletion, kTypeMerge or kTypeLogData - // then we probably don't have enough space to update in-place - // Maybe do something later - // Return false, and do normal Add() - return false; + // we don't have enough space for update inplace + Add(seq, kTypeValue, key, value); + return; } } } - // Key doesn't exist + // key doesn't exist + Add(seq, kTypeValue, key, value); +} + +bool MemTable::UpdateCallback(SequenceNumber seq, + const Slice& key, + const Slice& delta, + const Options& options) { + LookupKey lkey(key, seq); + Slice memkey = lkey.memtable_key(); + + std::shared_ptr iter( + table_->GetIterator(lkey.user_key())); + iter->Seek(key, memkey.data()); + + if (iter->Valid()) { + // entry format is: + // key_length varint32 + // userkey char[klength-8] + // tag uint64 + // vlength varint32 + // value char[vlength] + // Check that it belongs to same user key. We do not check the + // sequence number since the Seek() call above should have skipped + // all entries with overly large sequence numbers. 
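+    // (Note: key_length counts the user key plus the 8-byte tag, which is
+    // why the user key below is read as Slice(key_ptr, key_length - 8).)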
+ const char* entry = iter->key(); + uint32_t key_length; + const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if (comparator_.comparator.user_comparator()->Compare( + Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) { + // Correct user key + const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); + switch (static_cast(tag & 0xff)) { + case kTypeValue: { + Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length); + uint32_t prev_value_size = prev_value.size(); + + WriteLock wl(GetLock(lkey.user_key())); + std::string str_value; + if (options.inplace_callback(const_cast(prev_value.data()), + prev_value_size, delta, &str_value)) { + // Value already updated by callback. + // TODO: Change size of value in memtable slice. + // This works for leaf, since size is already encoded in the + // value. It doesn't depend on rocksdb buffer size. + return true; + } + Slice slice_value = Slice(str_value); + uint32_t new_value_size = slice_value.size(); + + // Update value, if newValue size <= curValue size + if (new_value_size <= prev_value_size ) { + char* p = EncodeVarint32(const_cast(key_ptr) + key_length, + new_value_size); + + memcpy(p, slice_value.data(), new_value_size); + assert( + (p + new_value_size) - entry == + (unsigned) (VarintLength(key_length) + + key_length + + VarintLength(new_value_size) + + new_value_size) + ); + return true; + } else { + // If we don't have enough space to update in-place + // Return as NotUpdatable, and do normal Add() + Add(seq, kTypeValue, key, slice_value); + return true; + } + } + default: + break; + } + } + } + // If the latest value is not kTypeValue + // or key doesn't exist return false; } } // namespace rocksdb diff --git a/db/memtable.h b/db/memtable.h index 946c99bf2..c94bf0b0b 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -98,16 +98,31 @@ class MemTable { bool Get(const LookupKey& key, std::string* value, Status* s, MergeContext& merge_context, const Options& options); - // Update the value and return status ok, - // if key exists in current memtable - // if new sizeof(new_value) <= sizeof(old_value) && - // old_value for that key is a put i.e. kTypeValue - // else return false, and status - NotUpdatable() - // else return false, and status - NotFound() - bool Update(SequenceNumber seq, ValueType type, + // Attempts to update the new_value inplace, else does normal Add + // Pseudocode + // if key exists in current memtable && prev_value is of type kTypeValue + // if new sizeof(new_value) <= sizeof(prev_value) + // update inplace + // else add(key, new_value) + // else add(key, new_value) + void Update(SequenceNumber seq, const Slice& key, const Slice& value); + // If prev_value for key exits, attempts to update it inplace. + // else returns false + // Pseudocode + // if key exists in current memtable && prev_value is of type kTypeValue + // new_value = delta(prev_value) + // if sizeof(new_value) <= sizeof(prev_value) + // update inplace + // else add(key, new_value) + // else return false + bool UpdateCallback(SequenceNumber seq, + const Slice& key, + const Slice& delta, + const Options& options); + // Returns the edits area that is needed for flushing the memtable VersionEdit* GetEdits() { return &edit_; } @@ -149,7 +164,7 @@ class MemTable { bool flush_completed_; // finished the flush uint64_t file_number_; // filled up after flush is complete - // The udpates to be applied to the transaction log when this + // The updates to be applied to the transaction log when this // memtable is flushed to storage. 
VersionEdit edit_; diff --git a/db/write_batch.cc b/db/write_batch.cc index c04930bbf..76e4381a5 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -115,7 +115,7 @@ Status WriteBatch::Iterate(Handler* handler) const { return Status::Corruption("unknown WriteBatch tag"); } } - if (found != WriteBatchInternal::Count(this)) { + if (found != WriteBatchInternal::Count(this)) { return Status::Corruption("WriteBatch has wrong count"); } else { return Status::OK(); @@ -194,14 +194,44 @@ class MemTableInserter : public WriteBatch::Handler { } virtual void Put(const Slice& key, const Slice& value) { - if (options_->inplace_update_support - && mem_->Update(sequence_, kTypeValue, key, value)) { + if (!options_->inplace_update_support) { + mem_->Add(sequence_, kTypeValue, key, value); + } else if (options_->inplace_callback == nullptr) { + mem_->Update(sequence_, key, value); RecordTick(options_->statistics.get(), NUMBER_KEYS_UPDATED); } else { - mem_->Add(sequence_, kTypeValue, key, value); + if (mem_->UpdateCallback(sequence_, key, value, *options_)) { + RecordTick(options_->statistics.get(), NUMBER_KEYS_UPDATED); + } else { + // key not found in memtable. Do sst get/update/add + SnapshotImpl read_from_snapshot; + read_from_snapshot.number_ = sequence_; + ReadOptions ropts; + ropts.snapshot = &read_from_snapshot; + + std::string prev_value; + std::string merged_value; + Status s = db_->Get(ropts, key, &prev_value); + char* prev_buffer = const_cast(prev_value.c_str()); + size_t prev_size = prev_value.size(); + if (options_->inplace_callback(s.ok() ? prev_buffer: nullptr, + s.ok() ? prev_size: 0, + value, &merged_value)) { + // prev_value is updated in-place with final value. + mem_->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size)); + RecordTick(options_->statistics.get(), NUMBER_KEYS_WRITTEN); + } else { + // merged_value contains the final value. Only add if not empty. + if (!merged_value.empty()) { + mem_->Add(sequence_, kTypeValue, key, Slice(merged_value)); + RecordTick(options_->statistics.get(), NUMBER_KEYS_WRITTEN); + } + } + } } sequence_++; } + virtual void Merge(const Slice& key, const Slice& value) { mem_->Add(sequence_, kTypeMerge, key, value); sequence_++; diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 660d36838..c742d932d 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -627,10 +627,13 @@ struct Options { TablePropertiesCollectors; TablePropertiesCollectors table_properties_collectors; - // Allows thread-safe inplace updates. Requires Updates iff - // * key exists in current memtable - // * new sizeof(new_value) <= sizeof(old_value) - // * old_value for that key is a put i.e. kTypeValue + // Allows thread-safe inplace updates. + // If inplace_callback function is not set, + // Put(key, new_value) will update inplace the existing_value iff + // * key exists in current memtable + // * new sizeof(new_value) <= sizeof(existing_value) + // * existing_value for that key is a put i.e. kTypeValue + // If inplace_callback function is set, check doc for inplace_callback. // Default: false. bool inplace_update_support; @@ -638,13 +641,46 @@ struct Options { // Default: 10000, if inplace_update_support = true, else 0. size_t inplace_update_num_locks; + + // * existing_value - pointer to previous value (from both memtable and sst). + // nullptr if key doesn't exist + // * existing_value_size - sizeof(existing_value). 0 if key doesn't exist + // * delta_value - Delta value to be merged with the 'existing_value'. 
+ // Stored in transaction logs.
+ // * merged_value - Set when delta is applied on the previous value.
+
+ // Applicable only when inplace_update_support is true,
+ // this callback function is called at the time of updating the memtable
+ // as part of a Put operation, lets say Put(key, delta_value). It allows the
+ // 'delta_value' specified as part of the Put operation to be merged with
+ // an 'existing_value' of the 'key' in the database.
+
+ // If the merged value is smaller in size that the 'existing_value',
+ // then this function can update the 'existing_value' buffer inplace if it
+ // wishes to. The callback should return true in this case. (In this case,
+ // the snapshot-semantics of the rocksdb Iterator is not atomic anymore).
+
+ // If the application does not wish to modify the 'existing_value' buffer
+ // inplace, then it should allocate a new buffer and update it by merging the
+ // 'existing_value' and the Put 'delta_value' and set the 'merged_value'
+ // pointer to this buffer. The callback should return false in this case. It
+ // is upto the calling layer to manage the memory returned in 'merged_value'.
+
+ // Please remember that the original call from the application is Put(key,
+ // delta_value). So the transaction log (if enabled) will still contain
+ // (key, delta_value). The 'merged_value' is not stored in the transaction log
+ // Hence the inplace_callback function should be consistent across db reopens.
+
+ // Default: nullptr
+ bool (*inplace_callback)(char* existing_value, size_t existing_value_size,
+                          Slice delta_value, std::string* merged_value);
+
 // if prefix_extractor is set and bloom_bits is not 0, create prefix bloom
 // for memtable
 uint32_t memtable_prefix_bloom_bits;

 // number of hash probes per key
 uint32_t memtable_prefix_bloom_probes;
-
 };

//
diff --git a/util/options.cc b/util/options.cc
index c89d45bb0..93aa268f1 100644
--- a/util/options.cc
+++ b/util/options.cc
@@ -102,6 +102,7 @@ Options::Options()
           std::shared_ptr<TableFactory>(new BlockBasedTableFactory())),
       inplace_update_support(false),
       inplace_update_num_locks(10000),
+      inplace_callback(nullptr),
       memtable_prefix_bloom_bits(0),
       memtable_prefix_bloom_probes(6) {
   assert(memtable_factory.get() != nullptr);

From 9ea8bf90f117ee9f9ae1dc11d9eaeb9ab768841b Mon Sep 17 00:00:00 2001
From: Siying Dong
Date: Tue, 14 Jan 2014 10:42:36 -0800
Subject: [PATCH 51/70] DB::Put() to estimate write batch data size needed and
 pre-allocate buffer

Summary:
In one of our CPU profiles, we saw some CPU cost from string::reserve() inside Batch.Put(). This patch should be able to reduce some of that cost by allocating a sufficient buffer beforehand.

Since it is a trivial percentage of CPU costs, I didn't find a way to show the improvement in one of the benchmarks. I'll deploy it to the same application and do the same CPU profiling to make sure those CPU costs are reduced.
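As a back-of-the-envelope check of the 24-byte constant used in this patch (a sketch for illustration only; EstimatePutSize is a made-up helper, not a RocksDB function):

    // A WriteBatch holding a single Put is laid out as:
    //   [8-byte sequence][4-byte count]              <- 12-byte batch header
    //   [1-byte type tag][key varint32][key bytes]
    //   [value varint32][value bytes]
    // A varint32 length never takes more than 5 bytes, so the fixed overhead
    // is at most 12 + 1 + 5 + 5 = 23 bytes, and reserving
    // key.size() + value.size() + 24 always leaves enough room.
    size_t EstimatePutSize(const rocksdb::Slice& key,
                           const rocksdb::Slice& value) {
      return key.size() + value.size() + 24;
    }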
Test Plan: make all check

Reviewers: haobo, kailiu, igor

Reviewed By: haobo

CC: leveldb, nkg-

Differential Revision: https://reviews.facebook.net/D15135
---
 db/db_impl.cc                 | 5 ++++-
 db/write_batch.cc             | 3 ++-
 include/rocksdb/write_batch.h | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/db/db_impl.cc b/db/db_impl.cc
index 297ac0ade..3e470389b 100644
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@@ -3789,7 +3789,10 @@ Status DBImpl::GetDbIdentity(std::string& identity) {
 // Default implementations of convenience methods that subclasses of DB
 // can call if they wish
 Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
-  WriteBatch batch;
+  // Pre-allocate size of write batch conservatively.
+  // 8 bytes are taken by header, 4 bytes for count, 1 byte for type,
+  // and we allocate 11 extra bytes for key length, as well as value length.
+  WriteBatch batch(key.size() + value.size() + 24);
   batch.Put(key, value);
   return Write(opt, &batch);
 }
diff --git a/db/write_batch.cc b/db/write_batch.cc
index 76e4381a5..4f8392d84 100644
--- a/db/write_batch.cc
+++ b/db/write_batch.cc
@@ -35,7 +35,8 @@ namespace rocksdb {
 // WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
 static const size_t kHeader = 12;

-WriteBatch::WriteBatch() {
+WriteBatch::WriteBatch(size_t reserved_bytes) {
+  rep_.reserve((reserved_bytes > kHeader) ? reserved_bytes : kHeader);
   Clear();
 }
diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h
index 798807045..1c6210fe9 100644
--- a/include/rocksdb/write_batch.h
+++ b/include/rocksdb/write_batch.h
@@ -35,7 +35,7 @@ struct SliceParts;

 class WriteBatch {
  public:
-  WriteBatch();
+  explicit WriteBatch(size_t reserved_bytes = 0);
   ~WriteBatch();

   // Store the mapping "key->value" in the database.

From cd535c2280dea4c45b5138bce27870ff1c633d11 Mon Sep 17 00:00:00 2001
From: Kai Liu
Date: Tue, 14 Jan 2014 21:30:13 -0800
Subject: [PATCH 52/70] Optimize MayContainHash()

Summary:
In the latest profiles from leaf, MayContainHash() consistently consumes 5%~7% of CPU usage. I checked the code and did an experiment with/without inlining this method.

In release mode, with `1024 * 1024 * 256` bits and `1024 * 512` entries, both runs call MayContainHash() 2^30 times, with distinct parameters. As the results showed, this patch reduced the running time from 9.127 sec to 7.891 sec.
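For reference, a sketch of that experiment, written against the DynamicBloom interface shown in the diff below (RunExperiment and the hash constants are made up for illustration):

    #include "util/dynamic_bloom.h"

    uint64_t RunExperiment() {
      rocksdb::DynamicBloom bloom(1024 * 1024 * 256, /* num_probes */ 6);
      for (uint32_t i = 0; i < 1024 * 512; ++i) {
        bloom.AddHash(i * 2654435761u);  // arbitrary, distinct hash values
      }
      uint64_t hits = 0;
      for (uint64_t i = 0; i < (1ull << 30); ++i) {
        hits += bloom.MayContainHash(static_cast<uint32_t>(i * 0x9e3779b9u));
      }
      return hits;  // consume the result so the loop is not optimized away
    }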
Test Plan: make check Reviewers: sdong, haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D15177 --- util/dynamic_bloom.cc | 47 +++++++------------------------------------ util/dynamic_bloom.h | 33 +++++++++++++++++++++++++----- 2 files changed, 35 insertions(+), 45 deletions(-) diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc index 84f964d9e..94df660ef 100644 --- a/util/dynamic_bloom.cc +++ b/util/dynamic_bloom.cc @@ -20,50 +20,17 @@ DynamicBloom::DynamicBloom(uint32_t total_bits, uint32_t (*hash_func)(const Slice& key), uint32_t num_probes) : hash_func_(hash_func), - total_bits_((total_bits + 7) / 8 * 8), - num_probes_(num_probes) { + kTotalBits((total_bits + 7) / 8 * 8), + kNumProbes(num_probes) { assert(hash_func_); - assert(num_probes_ > 0); - assert(total_bits_ > 0); - data_.reset(new unsigned char[total_bits_ / 8]()); + assert(kNumProbes > 0); + assert(kTotalBits > 0); + data_.reset(new unsigned char[kTotalBits / 8]()); } DynamicBloom::DynamicBloom(uint32_t total_bits, uint32_t num_probes) - : hash_func_(&BloomHash), - total_bits_((total_bits + 7) / 8 * 8), - num_probes_(num_probes) { - assert(num_probes_ > 0); - assert(total_bits_ > 0); - data_.reset(new unsigned char[total_bits_ / 8]()); + : DynamicBloom(total_bits, &BloomHash, num_probes) { } -void DynamicBloom::Add(const Slice& key) { - AddHash(hash_func_(key)); -} - -void DynamicBloom::AddHash(uint32_t h) { - const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits - for (uint32_t i = 0; i < num_probes_; i++) { - const uint32_t bitpos = h % total_bits_; - data_[bitpos/8] |= (1 << (bitpos % 8)); - h += delta; - } -} - -bool DynamicBloom::MayContain(const Slice& key) { - return (MayContainHash(hash_func_(key))); -} - -bool DynamicBloom::MayContainHash(uint32_t h) { - const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits - for (uint32_t i = 0; i < num_probes_; i++) { - const uint32_t bitpos = h % total_bits_; - if ((data_[bitpos/8] & (1 << (bitpos % 8))) - == 0) return false; - h += delta; - } - return true; -} - -} +} // rocksdb diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h index aa29a4ae7..2b699dc77 100644 --- a/util/dynamic_bloom.h +++ b/util/dynamic_bloom.h @@ -13,9 +13,7 @@ namespace rocksdb { class Slice; class DynamicBloom { - public: - // total_bits: fixed total bits for the bloom // hash_func: customized hash function // num_probes: number of hash probes for a single key @@ -26,7 +24,7 @@ class DynamicBloom { explicit DynamicBloom(uint32_t total_bits, uint32_t num_probes = 6); // Assuming single threaded access to Add - void Add(const Slice& key); + void Add(const Slice& key) { AddHash(hash_func_(key)); } // Assuming single threaded access to Add void AddHash(uint32_t hash); @@ -39,9 +37,34 @@ class DynamicBloom { private: uint32_t (*hash_func_)(const Slice& key); - uint32_t total_bits_; - uint32_t num_probes_; + const uint32_t kTotalBits; + const uint32_t kNumProbes; std::unique_ptr data_; }; +inline bool DynamicBloom::MayContain(const Slice& key) { + return (MayContainHash(hash_func_(key))); +} + +inline bool DynamicBloom::MayContainHash(uint32_t h) { + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (uint32_t i = 0; i < kNumProbes; i++) { + const uint32_t bitpos = h % kTotalBits; + if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) { + return false; + } + h += delta; + } + return true; } + +inline void DynamicBloom::AddHash(uint32_t h) { + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for 
(uint32_t i = 0; i < kNumProbes; i++) {
+    const uint32_t bitpos = h % kTotalBits;
+    data_[bitpos / 8] |= (1 << (bitpos % 8));
+    h += delta;
+  }
+}
+
+}  // rocksdb

From 1447bb5919256ac94872821b48baff3cf2b99c26 Mon Sep 17 00:00:00 2001
From: Naman Gupta
Date: Thu, 16 Jan 2014 15:11:19 -0800
Subject: [PATCH 53/70] Allow callback to change size of existing value.
 Change return type of the callback function to an enum status to handle 3
 cases.

Summary:
This diff fixes 2 hacks:
* The callback function can modify the existing value inplace, if the merged value fits within the existing buffer size. But currently the existing buffer size is not being modified. Now the callback receives a uint32_t* that allows the size to be modified. Since the size is encoded as a varint in the internal key for the memtable, the entire value may have to be copied to a new location if the new size varint is smaller than the existing size varint.
* The callback function now has 3 functionalities:
  1. Modify the existing buffer inplace, and update the size correspondingly. To indicate this, it returns 1.
  2. Generate a new buffer with the merged value. It returns 2.
  3. Fail to do either of the above, based on application logic. It returns 0.

Test Plan: Just make all for now. I'm adding another unit test to test each scenario.

Reviewers: dhruba, haobo

Reviewed By: haobo

CC: leveldb, sdong, kailiu, xinyaohu, sumeet, danguo

Differential Revision: https://reviews.facebook.net/D15195
---
 db/db_test.cc             | 100 ++++++++++++++++++++++++++++++++------
 db/memtable.cc            |  76 ++++++++++++++---------------
 db/write_batch.cc         |  26 +++++-----
 include/rocksdb/options.h |  55 +++++++++++++--------
 4 files changed, 170 insertions(+), 87 deletions(-)

diff --git a/db/db_test.cc b/db/db_test.cc
index 9a5d128df..42bcf8277 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -712,20 +712,49 @@ class DBTest {
   // If previous value is nullptr or delta is > than previous value,
   // sets newValue with delta
   // If previous value is not empty,
-  // updates previous value with 'b' string of previous value size
+  // updates previous value with 'b' string of previous value size - 1.
+ static UpdateStatus + updateInPlaceSmallerSize(char* prevValue, uint32_t* prevSize, + Slice delta, std::string* newValue) { + if (prevValue == nullptr) { *newValue = std::string(delta.size(), 'c'); - return false; + return UpdateStatus::UPDATED; } else { - std::string str_b = std::string(prevSize, 'b'); + *prevSize = *prevSize - 1; + std::string str_b = std::string(*prevSize, 'b'); memcpy(prevValue, str_b.c_str(), str_b.size()); - return true; + return UpdateStatus::UPDATED_INPLACE; } } - // Used to test InplaceUpdate + static UpdateStatus + updateInPlaceSmallerVarintSize(char* prevValue, uint32_t* prevSize, + Slice delta, std::string* newValue) { + if (prevValue == nullptr) { + *newValue = std::string(delta.size(), 'c'); + return UpdateStatus::UPDATED; + } else { + *prevSize = 1; + std::string str_b = std::string(*prevSize, 'b'); + memcpy(prevValue, str_b.c_str(), str_b.size()); + return UpdateStatus::UPDATED_INPLACE; + } + } + + static UpdateStatus + updateInPlaceLargerSize(char* prevValue, uint32_t* prevSize, + Slice delta, std::string* newValue) { + *newValue = std::string(delta.size(), 'c'); + return UpdateStatus::UPDATED; + } + + static UpdateStatus + updateInPlaceNoAction(char* prevValue, uint32_t* prevSize, + Slice delta, std::string* newValue) { + return UpdateStatus::UPDATE_FAILED; + } + + // Utility method to test InplaceUpdate void validateNumberOfEntries(int numValues) { Iterator* iter = dbfull()->TEST_NewInternalIterator(); iter->SeekToFirst(); @@ -2619,7 +2648,7 @@ TEST(DBTest, InPlaceUpdateLargeNewValue) { } -TEST(DBTest, InPlaceUpdateCallback) { +TEST(DBTest, InPlaceUpdateCallbackSmallerSize) { do { Options options = CurrentOptions(); options.create_if_missing = true; @@ -2628,7 +2657,7 @@ TEST(DBTest, InPlaceUpdateCallback) { options.env = env_; options.write_buffer_size = 100000; options.inplace_callback = - rocksdb::DBTest::updateInPlace; + rocksdb::DBTest::updateInPlaceSmallerSize; Reopen(&options); // Update key with values of smaller size @@ -2638,7 +2667,7 @@ TEST(DBTest, InPlaceUpdateCallback) { for (int i = numValues; i > 0; i--) { ASSERT_OK(Put("key", DummyString(i, 'a'))); - ASSERT_EQ(DummyString(numValues, 'b'), Get("key")); + ASSERT_EQ(DummyString(i - 1, 'b'), Get("key")); } // Only 1 instance for that key. @@ -2647,9 +2676,31 @@ TEST(DBTest, InPlaceUpdateCallback) { } while (ChangeCompactOptions()); } -TEST(DBTest, InPlaceUpdateCallbackNotFound) { +TEST(DBTest, InPlaceUpdateCallbackSmallerVarintSize) { do { - //Test sst get/update/put + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + rocksdb::DBTest::updateInPlaceSmallerVarintSize; + Reopen(&options); + + // Update key with values of smaller varint size + int numValues = 265; + ASSERT_OK(Put("key", DummyString(numValues, 'a'))); + ASSERT_EQ(DummyString(numValues, 'c'), Get("key")); + + for (int i = numValues; i > 0; i--) { + ASSERT_OK(Put("key", DummyString(i, 'a'))); + ASSERT_EQ(DummyString(1, 'b'), Get("key")); + } + + // Only 1 instance for that key. 
+ validateNumberOfEntries(1); + } while (ChangeCompactOptions()); } @@ -2662,12 +2713,12 @@ TEST(DBTest, InPlaceUpdateCallbackLargeNewValue) { options.env = env_; options.write_buffer_size = 100000; options.inplace_callback = - rocksdb::DBTest::updateInPlace; + rocksdb::DBTest::updateInPlaceLargerSize; Reopen(&options); // Update key with values of larger size int numValues = 10; - for (int i = 1; i <= numValues; i++) { + for (int i = 0; i < numValues; i++) { ASSERT_OK(Put("key", DummyString(i, 'a'))); ASSERT_EQ(DummyString(i, 'c'), Get("key")); } @@ -2679,6 +2730,25 @@ TEST(DBTest, InPlaceUpdateCallbackLargeNewValue) { } while (ChangeCompactOptions()); } +TEST(DBTest, InPlaceUpdateCallbackNoAction) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + rocksdb::DBTest::updateInPlaceNoAction; + Reopen(&options); + + // Callback function requests no actions from db + ASSERT_OK(Put("key", DummyString(1, 'a'))); + ASSERT_EQ(Get("key"), "NOT_FOUND"); + + } while (ChangeCompactOptions()); +} + // This is a static filter used for filtering // kvs during the compaction process. static int cfilter_count; diff --git a/db/memtable.cc b/db/memtable.cc index 430c3589b..73a24f8d7 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -346,21 +346,21 @@ void MemTable::Update(SequenceNumber seq, switch (static_cast(tag & 0xff)) { case kTypeValue: { Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length); - uint32_t prev_value_size = prev_value.size(); - uint32_t new_value_size = value.size(); + uint32_t prev_size = prev_value.size(); + uint32_t new_size = value.size(); - // Update value, if newValue size <= curValue size - if (new_value_size <= prev_value_size ) { + // Update value, if new value size <= previous value size + if (new_size <= prev_size ) { char* p = EncodeVarint32(const_cast(key_ptr) + key_length, - new_value_size); + new_size); WriteLock wl(GetLock(lkey.user_key())); - memcpy(p, value.data(), new_value_size); + memcpy(p, value.data(), new_size); assert( - (p + new_value_size) - entry == + (p + new_size) - entry == (unsigned) (VarintLength(key_length) + key_length + - VarintLength(new_value_size) + - new_value_size) + VarintLength(new_size) + + new_size) ); // no need to update bloom, as user key does not change. return; @@ -380,9 +380,9 @@ void MemTable::Update(SequenceNumber seq, } bool MemTable::UpdateCallback(SequenceNumber seq, - const Slice& key, - const Slice& delta, - const Options& options) { + const Slice& key, + const Slice& delta, + const Options& options) { LookupKey lkey(key, seq); Slice memkey = lkey.memtable_key(); @@ -410,39 +410,35 @@ bool MemTable::UpdateCallback(SequenceNumber seq, switch (static_cast(tag & 0xff)) { case kTypeValue: { Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length); - uint32_t prev_value_size = prev_value.size(); + uint32_t prev_size = prev_value.size(); + + char* prev_buffer = const_cast(prev_value.data()); + uint32_t new_prev_size = prev_size; - WriteLock wl(GetLock(lkey.user_key())); std::string str_value; - if (options.inplace_callback(const_cast(prev_value.data()), - prev_value_size, delta, &str_value)) { + WriteLock wl(GetLock(lkey.user_key())); + auto status = options.inplace_callback(prev_buffer, &new_prev_size, + delta, &str_value); + if (status == UpdateStatus::UPDATED_INPLACE) { // Value already updated by callback. 
- // TODO: Change size of value in memtable slice. - // This works for leaf, since size is already encoded in the - // value. It doesn't depend on rocksdb buffer size. + assert(new_prev_size <= prev_size); + if (new_prev_size < prev_size) { + // overwrite the new prev_size + char* p = EncodeVarint32(const_cast(key_ptr) + key_length, + new_prev_size); + if (VarintLength(new_prev_size) < VarintLength(prev_size)) { + // shift the value buffer as well. + memcpy(p, prev_buffer, new_prev_size); + } + } + RecordTick(options.statistics.get(), NUMBER_KEYS_UPDATED); return true; - } - Slice slice_value = Slice(str_value); - uint32_t new_value_size = slice_value.size(); - - // Update value, if newValue size <= curValue size - if (new_value_size <= prev_value_size ) { - char* p = EncodeVarint32(const_cast(key_ptr) + key_length, - new_value_size); - - memcpy(p, slice_value.data(), new_value_size); - assert( - (p + new_value_size) - entry == - (unsigned) (VarintLength(key_length) + - key_length + - VarintLength(new_value_size) + - new_value_size) - ); + } else if (status == UpdateStatus::UPDATED) { + Add(seq, kTypeValue, key, Slice(str_value)); + RecordTick(options.statistics.get(), NUMBER_KEYS_WRITTEN); return true; - } else { - // If we don't have enough space to update in-place - // Return as NotUpdatable, and do normal Add() - Add(seq, kTypeValue, key, slice_value); + } else if (status == UpdateStatus::UPDATE_FAILED) { + // No action required. Return. return true; } } diff --git a/db/write_batch.cc b/db/write_batch.cc index 6a427f1a6..72fd2a9ea 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -203,9 +203,8 @@ class MemTableInserter : public WriteBatch::Handler { RecordTick(options_->statistics.get(), NUMBER_KEYS_UPDATED); } else { if (mem_->UpdateCallback(sequence_, key, value, *options_)) { - RecordTick(options_->statistics.get(), NUMBER_KEYS_UPDATED); } else { - // key not found in memtable. Do sst get/update/add + // key not found in memtable. Do sst get, update, add SnapshotImpl read_from_snapshot; read_from_snapshot.number_ = sequence_; ReadOptions ropts; @@ -215,22 +214,25 @@ class MemTableInserter : public WriteBatch::Handler { std::string merged_value; Status s = db_->Get(ropts, key, &prev_value); char* prev_buffer = const_cast(prev_value.c_str()); - size_t prev_size = prev_value.size(); - if (options_->inplace_callback(s.ok() ? prev_buffer: nullptr, - s.ok() ? prev_size: 0, - value, &merged_value)) { + uint32_t prev_size = prev_value.size(); + auto status = + options_->inplace_callback(s.ok() ? prev_buffer: nullptr, + s.ok() ? &prev_size: nullptr, + value, &merged_value); + if (status == UpdateStatus::UPDATED_INPLACE) { // prev_value is updated in-place with final value. mem_->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size)); RecordTick(options_->statistics.get(), NUMBER_KEYS_WRITTEN); - } else { - // merged_value contains the final value. Only add if not empty. - if (!merged_value.empty()) { - mem_->Add(sequence_, kTypeValue, key, Slice(merged_value)); - RecordTick(options_->statistics.get(), NUMBER_KEYS_WRITTEN); - } + } else if (status == UpdateStatus::UPDATED) { + // merged_value contains the final value. + mem_->Add(sequence_, kTypeValue, key, Slice(merged_value)); + RecordTick(options_->statistics.get(), NUMBER_KEYS_WRITTEN); } } } + // Since all Puts are logged in trasaction logs (if enabled), always bump + // sequence number. Even if the update eventually fails and does not result + // in memtable add/update. 
sequence_++; } diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 8499f6025..cf2daa819 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -65,6 +65,12 @@ struct CompressionOptions { : window_bits(wbits), level(lev), strategy(strategy) {} }; +enum UpdateStatus { // Return status For inplace update callback + UPDATE_FAILED = 0, // Nothing to update + UPDATED_INPLACE = 1, // Value updated inplace + UPDATED = 2, // No inplace update. Merged value set +}; + // Options to control the behavior of a database (passed to DB::Open) struct Options { // ------------------- @@ -650,38 +656,47 @@ struct Options { // Default: 10000, if inplace_update_support = true, else 0. size_t inplace_update_num_locks; - // * existing_value - pointer to previous value (from both memtable and sst). - // nullptr if key doesn't exist - // * existing_value_size - sizeof(existing_value). 0 if key doesn't exist - // * delta_value - Delta value to be merged with the 'existing_value'. - // Stored in transaction logs. - // * merged_value - Set when delta is applied on the previous value. + // existing_value - pointer to previous value (from both memtable and sst). + // nullptr if key doesn't exist + // existing_value_size - pointer to size of existing_value). + // nullptr if key doesn't exist + // delta_value - Delta value to be merged with the existing_value. + // Stored in transaction logs. + // merged_value - Set when delta is applied on the previous value. // Applicable only when inplace_update_support is true, // this callback function is called at the time of updating the memtable // as part of a Put operation, lets say Put(key, delta_value). It allows the // 'delta_value' specified as part of the Put operation to be merged with - // an 'existing_value' of the 'key' in the database. + // an 'existing_value' of the key in the database. // If the merged value is smaller in size that the 'existing_value', - // then this function can update the 'existing_value' buffer inplace if it - // wishes to. The callback should return true in this case. (In this case, - // the snapshot-semantics of the rocksdb Iterator is not atomic anymore). - - // If the application does not wish to modify the 'existing_value' buffer - // inplace, then it should allocate a new buffer and update it by merging the - // 'existing_value' and the Put 'delta_value' and set the 'merged_value' - // pointer to this buffer. The callback should return false in this case. It - // is upto the calling layer to manage the memory returned in 'merged_value'. + // then this function can update the 'existing_value' buffer inplace and + // the corresponding 'existing_value'_size pointer, if it wishes to. + // The callback should return UpdateStatus::UPDATED_INPLACE. + // In this case. (In this case, the snapshot-semantics of the rocksdb + // Iterator is not atomic anymore). + + // If the merged value is larger in size than the 'existing_value' or the + // application does not wish to modify the 'existing_value' buffer inplace, + // then the merged value should be returned via *merge_value. It is set by + // merging the 'existing_value' and the Put 'delta_value'. The callback should + // return UpdateStatus::UPDATED in this case. This merged value will be added + // to the memtable. + + // If merging fails or the application does not wish to take any action, + // then the callback should return UpdateStatus::UPDATE_FAILED. // Please remember that the original call from the application is Put(key, - // delta_value). 
So the transaction log (if enabled) will still contain - // (key, delta_value). The 'merged_value' is not stored in the transaction log + // delta_value). So the transaction log (if enabled) will still contain (key, + // delta_value). The 'merged_value' is not stored in the transaction log. // Hence the inplace_callback function should be consistent across db reopens. // Default: nullptr - bool (*inplace_callback)(char* existing_value, size_t existing_value_size, - Slice delta_value, std::string* merged_value); + UpdateStatus (*inplace_callback)(char* existing_value, + uint32_t* existing_value_size, + Slice delta_value, + std::string* merged_value); // if prefix_extractor is set and bloom_bits is not 0, create prefix bloom // for memtable From 23576d773fb3eaf2ca3a1a3b9bc4afa0f03ab958 Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Thu, 16 Jan 2014 17:00:34 -0800 Subject: [PATCH 54/70] Remove the extra line in "make release" Summary: that line was introduced during merge. --- Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/Makefile b/Makefile index b770ca7b0..ae1ee56f2 100644 --- a/Makefile +++ b/Makefile @@ -139,7 +139,6 @@ all: $(LIBRARY) $(PROGRAMS) # Will also generate shared libraries. release: $(MAKE) clean - OPT="-DNDEBUG -O2" $(MAKE) -j32 OPT="-DNDEBUG -O2" $(MAKE) all -j32 OPT="-DNDEBUG -O2" $(MAKE) $(SHARED) -j32 From ef602f62750a5a56c48c5c1dad22a0705bd2d2be Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Fri, 17 Jan 2014 12:22:39 -0800 Subject: [PATCH 55/70] Misc cleanup on performance branch Summary: Did some trivial stuffs: * Add more comments; * fix compiler's warning messages (uninitialized variables). * etc Test Plan: make check --- build_tools/format-diff.sh | 2 -- db/memtable.cc | 8 ++++---- table/plain_table_reader.cc | 10 +++++----- util/coding_test.cc | 6 +++--- util/dynamic_bloom.h | 12 +++++++----- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh index ceae38192..2d6062009 100755 --- a/build_tools/format-diff.sh +++ b/build_tools/format-diff.sh @@ -47,7 +47,6 @@ fi # ln -s `git rev-parse --show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH # fi # fi - set -e uncommitted_code=`git diff HEAD` @@ -55,7 +54,6 @@ uncommitted_code=`git diff HEAD` # If there's no uncommitted changes, we assume user are doing post-commit # format check, in which case we'll check the modified lines from latest commit. # Otherwise, we'll check format of the uncommitted code only. -format_last_commit=0 if [ -z "$uncommitted_code" ] then # Check the format of last commit diff --git a/db/memtable.cc b/db/memtable.cc index 73a24f8d7..ff282a366 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -232,7 +232,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, // sequence number since the Seek() call above should have skipped // all entries with overly large sequence numbers. const char* entry = iter->key(); - uint32_t key_length; + uint32_t key_length = 0; const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); if (comparator_.comparator.user_comparator()->Compare( Slice(key_ptr, key_length - 8), key.user_key()) == 0) { @@ -337,7 +337,7 @@ void MemTable::Update(SequenceNumber seq, // sequence number since the Seek() call above should have skipped // all entries with overly large sequence numbers. 
const char* entry = iter->key(); - uint32_t key_length; + uint32_t key_length = 0; const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); if (comparator_.comparator.user_comparator()->Compare( Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) { @@ -401,7 +401,7 @@ bool MemTable::UpdateCallback(SequenceNumber seq, // sequence number since the Seek() call above should have skipped // all entries with overly large sequence numbers. const char* entry = iter->key(); - uint32_t key_length; + uint32_t key_length = 0; const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); if (comparator_.comparator.user_comparator()->Compare( Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) { @@ -466,7 +466,7 @@ size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) { for (; iter->Valid(); iter->Next()) { const char* entry = iter->key(); - uint32_t key_length; + uint32_t key_length = 0; const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); if (!comparator_.comparator.user_comparator()->Compare( Slice(iter_key_ptr, key_length - 8), key.user_key()) == 0) { diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index 4c396a359..2e3838547 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -405,7 +405,7 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, uint64_t prefix_index_offset = bucket_value ^ kSubIndexMask; const char* index_ptr = sub_index_ + prefix_index_offset; - uint32_t upper_bound; + uint32_t upper_bound = 0; const uint32_t* base_ptr = (const uint32_t*) GetVarint32Ptr(index_ptr, index_ptr + 4, &upper_bound); @@ -464,17 +464,17 @@ bool PlainTableReader::MayHavePrefix(uint32_t hash) { Status PlainTableReader::ReadKey(const char* row_ptr, Slice* key, size_t& bytes_read) { - const char* key_ptr; + const char* key_ptr = nullptr; bytes_read = 0; - size_t internal_key_size; + size_t internal_key_size = 0; if (IsFixedLength()) { internal_key_size = GetFixedInternalKeyLength(); key_ptr = row_ptr; } else { - uint32_t key_size; + uint32_t key_size = 0; key_ptr = GetVarint32Ptr(row_ptr, file_data_.data() + data_end_offset_, &key_size); - internal_key_size = (size_t) key_size; + internal_key_size = (size_t)key_size; bytes_read = key_ptr - row_ptr; } if (row_ptr + internal_key_size >= file_data_.data() + data_end_offset_) { diff --git a/util/coding_test.cc b/util/coding_test.cc index fb0613238..ed542d6bf 100644 --- a/util/coding_test.cc +++ b/util/coding_test.cc @@ -41,7 +41,7 @@ TEST(Coding, Fixed64) { const char* p = s.data(); for (int power = 0; power <= 63; power++) { uint64_t v = static_cast(1) << power; - uint64_t actual; + uint64_t actual = 0; actual = DecodeFixed64(p); ASSERT_EQ(v-1, actual); p += sizeof(uint64_t); @@ -90,7 +90,7 @@ TEST(Coding, Varint32) { const char* limit = p + s.size(); for (uint32_t i = 0; i < (32 * 32); i++) { uint32_t expected = (i / 32) << (i % 32); - uint32_t actual; + uint32_t actual = 0; const char* start = p; p = GetVarint32Ptr(p, limit, &actual); ASSERT_TRUE(p != nullptr); @@ -125,7 +125,7 @@ TEST(Coding, Varint64) { const char* limit = p + s.size(); for (unsigned int i = 0; i < values.size(); i++) { ASSERT_TRUE(p < limit); - uint64_t actual; + uint64_t actual = 0; const char* start = p; p = GetVarint64Ptr(p, limit, &actual); ASSERT_TRUE(p != nullptr); diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h index 2b699dc77..0851becbf 100644 --- a/util/dynamic_bloom.h +++ b/util/dynamic_bloom.h @@ -23,16 +23,16 @@ class DynamicBloom { explicit 
DynamicBloom(uint32_t total_bits, uint32_t num_probes = 6);

-  // Assuming single threaded access to Add
-  void Add(const Slice& key) { AddHash(hash_func_(key)); }
+  // Assuming single threaded access to this function.
+  void Add(const Slice& key);

-  // Assuming single threaded access to Add
+  // Assuming single threaded access to this function.
   void AddHash(uint32_t hash);

-  // Multithreaded access to MayContain is OK
+  // Multithreaded access to this function is OK
   bool MayContain(const Slice& key);

-  // Multithreaded access to MayContain is OK
+  // Multithreaded access to this function is OK
   bool MayContainHash(uint32_t hash);

  private:
@@ -42,6 +42,8 @@ class DynamicBloom {
   std::unique_ptr<unsigned char[]> data_;
 };

+inline void DynamicBloom::Add(const Slice& key) { AddHash(hash_func_(key)); }
+
 inline bool DynamicBloom::MayContain(const Slice& key) {
   return (MayContainHash(hash_func_(key)));
 }

From 7dea558e6d202c9dbf34e15077d5c9c8db594bf9 Mon Sep 17 00:00:00 2001
From: Siying Dong
Date: Tue, 21 Jan 2014 11:55:13 -0800
Subject: [PATCH 56/70] [Performance Branch] Fix a bug when merging from
 master

Summary:
Commit "1304d8c8cefe66be1a3caa5e93413211ba2486f2" (Merge branch 'master' into performance) removes a line in the performance branch by mistake. This patch fixes it.

Test Plan: make all check

Reviewers: haobo, kailiu, igor

Reviewed By: haobo

CC: leveldb

Differential Revision: https://reviews.facebook.net/D15297
---
 db/memtable.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/db/memtable.cc b/db/memtable.cc
index ff282a366..03e078459 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -95,6 +95,7 @@ class MemTableIterator: public Iterator {
     if (options.prefix) {
       iter_.reset(mem_.table_->GetPrefixIterator(*options.prefix));
     } else if (options.prefix_seek) {
+      dynamic_prefix_seek_ = true;
       iter_.reset(mem_.table_->GetDynamicPrefixIterator());
     } else {
       iter_.reset(mem_.table_->GetIterator());

From bb19b530cadf873e42a9901493dda4e0654ffddc Mon Sep 17 00:00:00 2001
From: Kai Liu
Date: Thu, 23 Jan 2014 15:51:26 -0800
Subject: [PATCH 57/70] Aggressively inlining the short functions in coding.cc

Summary:
This diff takes a more aggressive approach to inlining functions. A decent rule that I followed is "do not inline a function if it is more than 10 lines long."

Normally optimizing code by inlining is ugly and hard to control, but since one of our use cases spends a significant amount of CPU in functions from coding.cc, I'd like to try this diff out.

Test Plan:
1. The size of some .o files increased a little bit, but mostly by less than 1%. So I think the negative impact of inlining is negligible.
2. As the regression test shows (I ran it 10 times and calculated the average numbers):

   Metrics                                          Before     After
   ========================================================================
   rocksdb.build.fillseq.qps                        426595     444515 (+4.6%)
   rocksdb.build.memtablefillrandom.qps             121739     123110
   rocksdb.build.memtablereadrandom.qps             1285103    1280520
   rocksdb.build.overwrite.qps                      125816     135570 (+9%)
   rocksdb.build.readrandom_fillunique_random.qps   285995     296863
   rocksdb.build.readrandom_memtable_sst.qps        1027132    1027279
   rocksdb.build.readrandom.qps                     1041427    1054665
   rocksdb.build.readrandom_smallblockcache.qps     1028631    1038433
   rocksdb.build.readwhilewriting.qps               918352     914629

Reviewers: haobo, sdong, igor

CC: leveldb

Differential Revision: https://reviews.facebook.net/D15291
---
 include/rocksdb/options.h |   2 +-
 util/coding.cc            | 193 +++----------------------------------
 util/coding.h             | 162 ++++++++++++++++++++++++++++++++
 3 files changed, 178 insertions(+), 179 deletions(-)

diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index cf2daa819..672c66ed8 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -136,7 +136,7 @@ struct Options {
   // errors. This may have unforeseen ramifications: for example, a
   // corruption of one DB entry may cause a large number of entries to
   // become unreadable or for the entire DB to become unopenable.
-  // If any of the  writes to the database fails (Put, Delete, Merge, Write),
+  // If any of the writes to the database fails (Put, Delete, Merge, Write),
   // the database will switch to read-only mode and fail all other
   // Write operations.
   // Default: false
diff --git a/util/coding.cc b/util/coding.cc
index ce67fa486..c858697f8 100644
--- a/util/coding.cc
+++ b/util/coding.cc
@@ -8,128 +8,39 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "util/coding.h" - #include namespace rocksdb { -void EncodeFixed32(char* buf, uint32_t value) { -#if __BYTE_ORDER == __LITTLE_ENDIAN - memcpy(buf, &value, sizeof(value)); -#else - buf[0] = value & 0xff; - buf[1] = (value >> 8) & 0xff; - buf[2] = (value >> 16) & 0xff; - buf[3] = (value >> 24) & 0xff; -#endif -} - -void EncodeFixed64(char* buf, uint64_t value) { -#if __BYTE_ORDER == __LITTLE_ENDIAN - memcpy(buf, &value, sizeof(value)); -#else - buf[0] = value & 0xff; - buf[1] = (value >> 8) & 0xff; - buf[2] = (value >> 16) & 0xff; - buf[3] = (value >> 24) & 0xff; - buf[4] = (value >> 32) & 0xff; - buf[5] = (value >> 40) & 0xff; - buf[6] = (value >> 48) & 0xff; - buf[7] = (value >> 56) & 0xff; -#endif -} - -void PutFixed32(std::string* dst, uint32_t value) { - char buf[sizeof(value)]; - EncodeFixed32(buf, value); - dst->append(buf, sizeof(buf)); -} - -void PutFixed64(std::string* dst, uint64_t value) { - char buf[sizeof(value)]; - EncodeFixed64(buf, value); - dst->append(buf, sizeof(buf)); -} - char* EncodeVarint32(char* dst, uint32_t v) { // Operate on characters as unsigneds unsigned char* ptr = reinterpret_cast(dst); static const int B = 128; - if (v < (1<<7)) { + if (v < (1 << 7)) { *(ptr++) = v; - } else if (v < (1<<14)) { + } else if (v < (1 << 14)) { *(ptr++) = v | B; - *(ptr++) = v>>7; - } else if (v < (1<<21)) { + *(ptr++) = v >> 7; + } else if (v < (1 << 21)) { *(ptr++) = v | B; - *(ptr++) = (v>>7) | B; - *(ptr++) = v>>14; - } else if (v < (1<<28)) { + *(ptr++) = (v >> 7) | B; + *(ptr++) = v >> 14; + } else if (v < (1 << 28)) { *(ptr++) = v | B; - *(ptr++) = (v>>7) | B; - *(ptr++) = (v>>14) | B; - *(ptr++) = v>>21; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = v >> 21; } else { *(ptr++) = v | B; - *(ptr++) = (v>>7) | B; - *(ptr++) = (v>>14) | B; - *(ptr++) = (v>>21) | B; - *(ptr++) = v>>28; - } - return reinterpret_cast(ptr); -} - -void PutVarint32(std::string* dst, uint32_t v) { - char buf[5]; - char* ptr = EncodeVarint32(buf, v); - dst->append(buf, ptr - buf); -} - -char* EncodeVarint64(char* dst, uint64_t v) { - static const unsigned int B = 128; - unsigned char* ptr = reinterpret_cast(dst); - while (v >= B) { - *(ptr++) = (v & (B-1)) | B; - v >>= 7; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = (v >> 21) | B; + *(ptr++) = v >> 28; } - *(ptr++) = static_cast(v); return reinterpret_cast(ptr); } -void PutVarint64(std::string* dst, uint64_t v) { - char buf[10]; - char* ptr = EncodeVarint64(buf, v); - dst->append(buf, ptr - buf); -} - -void PutLengthPrefixedSlice(std::string* dst, const Slice& value) { - PutVarint32(dst, value.size()); - dst->append(value.data(), value.size()); -} - -void PutLengthPrefixedSliceParts(std::string* dst, - const SliceParts& slice_parts) { - uint32_t total_bytes = 0; - for (int i = 0; i < slice_parts.num_parts; ++i) { - total_bytes += slice_parts.parts[i].size(); - } - PutVarint32(dst, total_bytes); - for (int i = 0; i < slice_parts.num_parts; ++i) { - dst->append(slice_parts.parts[i].data(), slice_parts.parts[i].size()); - } -} - -int VarintLength(uint64_t v) { - int len = 1; - while (v >= 128) { - v >>= 7; - len++; - } - return len; -} - -const char* GetVarint32PtrFallback(const char* p, - const char* limit, +const char* GetVarint32PtrFallback(const char* p, const char* limit, uint32_t* value) { uint32_t result = 0; for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) { @@ -147,18 +58,6 @@ const char* GetVarint32PtrFallback(const char* p, return nullptr; } -bool GetVarint32(Slice* input, 
uint32_t* value) { - const char* p = input->data(); - const char* limit = p + input->size(); - const char* q = GetVarint32Ptr(p, limit, value); - if (q == nullptr) { - return false; - } else { - *input = Slice(q, limit - q); - return true; - } -} - const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) { uint64_t result = 0; for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) { @@ -176,58 +75,6 @@ const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) { return nullptr; } -bool GetVarint64(Slice* input, uint64_t* value) { - const char* p = input->data(); - const char* limit = p + input->size(); - const char* q = GetVarint64Ptr(p, limit, value); - if (q == nullptr) { - return false; - } else { - *input = Slice(q, limit - q); - return true; - } -} - -const char* GetLengthPrefixedSlice(const char* p, const char* limit, - Slice* result) { - uint32_t len; - p = GetVarint32Ptr(p, limit, &len); - if (p == nullptr) return nullptr; - if (p + len > limit) return nullptr; - *result = Slice(p, len); - return p + len; -} - -bool GetLengthPrefixedSlice(Slice* input, Slice* result) { - uint32_t len; - if (GetVarint32(input, &len) && - input->size() >= len) { - *result = Slice(input->data(), len); - input->remove_prefix(len); - return true; - } else { - return false; - } -} - -Slice GetLengthPrefixedSlice(const char* data) { - uint32_t len; - const char* p = data; - p = GetVarint32Ptr(p, p + 5, &len); // +5: we assume "p" is not corrupted - return Slice(p, len); -} - -Slice GetSliceUntil(Slice* slice, char delimiter) { - uint32_t len; - for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) { - // nothing - } - - Slice ret(slice->data(), len); - slice->remove_prefix(len + ((len < slice->size()) ? 
1 : 0)); - return ret; -} - void BitStreamPutInt(char* dst, size_t dstlen, size_t offset, uint32_t bits, uint64_t value) { assert((offset + bits + 7)/8 <= dstlen); @@ -316,14 +163,4 @@ void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits, BitStreamGetInt(dst, offset, bits)); } -uint64_t BitStreamGetInt(const std::string* src, size_t offset, - uint32_t bits) { - return BitStreamGetInt(src->data(), src->size(), offset, bits); -} - -uint64_t BitStreamGetInt(const Slice* src, size_t offset, - uint32_t bits) { - return BitStreamGetInt(src->data(), src->size(), offset, bits); -} - } // namespace rocksdb diff --git a/util/coding.h b/util/coding.h index 4477dc799..168a49bff 100644 --- a/util/coding.h +++ b/util/coding.h @@ -13,6 +13,7 @@ // * Strings are encoded prefixed by their length in varint format #pragma once +#include #include #include #include @@ -136,4 +137,165 @@ extern uint64_t BitStreamGetInt(const std::string* src, size_t offset, extern uint64_t BitStreamGetInt(const Slice* src, size_t offset, uint32_t bits); +// -- Implementation of the functions declared above +inline void EncodeFixed32(char* buf, uint32_t value) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; +#endif +} + +inline void EncodeFixed64(char* buf, uint64_t value) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; + buf[4] = (value >> 32) & 0xff; + buf[5] = (value >> 40) & 0xff; + buf[6] = (value >> 48) & 0xff; + buf[7] = (value >> 56) & 0xff; +#endif +} + +inline void PutFixed32(std::string* dst, uint32_t value) { + char buf[sizeof(value)]; + EncodeFixed32(buf, value); + dst->append(buf, sizeof(buf)); +} + +inline void PutFixed64(std::string* dst, uint64_t value) { + char buf[sizeof(value)]; + EncodeFixed64(buf, value); + dst->append(buf, sizeof(buf)); +} + +inline void PutVarint32(std::string* dst, uint32_t v) { + char buf[5]; + char* ptr = EncodeVarint32(buf, v); + dst->append(buf, ptr - buf); +} + +inline char* EncodeVarint64(char* dst, uint64_t v) { + static const unsigned int B = 128; + unsigned char* ptr = reinterpret_cast(dst); + while (v >= B) { + *(ptr++) = (v & (B - 1)) | B; + v >>= 7; + } + *(ptr++) = static_cast(v); + return reinterpret_cast(ptr); +} + +inline void PutVarint64(std::string* dst, uint64_t v) { + char buf[10]; + char* ptr = EncodeVarint64(buf, v); + dst->append(buf, ptr - buf); +} + +inline void PutLengthPrefixedSlice(std::string* dst, const Slice& value) { + PutVarint32(dst, value.size()); + dst->append(value.data(), value.size()); +} + +inline void PutLengthPrefixedSliceParts(std::string* dst, + const SliceParts& slice_parts) { + uint32_t total_bytes = 0; + for (int i = 0; i < slice_parts.num_parts; ++i) { + total_bytes += slice_parts.parts[i].size(); + } + PutVarint32(dst, total_bytes); + for (int i = 0; i < slice_parts.num_parts; ++i) { + dst->append(slice_parts.parts[i].data(), slice_parts.parts[i].size()); + } +} + +inline int VarintLength(uint64_t v) { + int len = 1; + while (v >= 128) { + v >>= 7; + len++; + } + return len; +} + +inline bool GetVarint32(Slice* input, uint32_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint32Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + 
*input = Slice(q, limit - q); + return true; + } +} + +inline bool GetVarint64(Slice* input, uint64_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint64Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + *input = Slice(q, limit - q); + return true; + } +} + +inline const char* GetLengthPrefixedSlice(const char* p, const char* limit, + Slice* result) { + uint32_t len = 0; + p = GetVarint32Ptr(p, limit, &len); + if (p == nullptr) return nullptr; + if (p + len > limit) return nullptr; + *result = Slice(p, len); + return p + len; +} + +inline bool GetLengthPrefixedSlice(Slice* input, Slice* result) { + uint32_t len = 0; + if (GetVarint32(input, &len) && input->size() >= len) { + *result = Slice(input->data(), len); + input->remove_prefix(len); + return true; + } else { + return false; + } +} + +inline Slice GetLengthPrefixedSlice(const char* data) { + uint32_t len = 0; + const char* p = data; + p = GetVarint32Ptr(p, p + 5, &len); // +5: we assume "p" is not corrupted + return Slice(p, len); +} + +inline Slice GetSliceUntil(Slice* slice, char delimiter) { + uint32_t len = 0; + for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) { + // nothing + } + + Slice ret(slice->data(), len); + slice->remove_prefix(len + ((len < slice->size()) ? 1 : 0)); + return ret; +} + +inline uint64_t BitStreamGetInt(const std::string* src, size_t offset, + uint32_t bits) { + return BitStreamGetInt(src->data(), src->size(), offset, bits); +} + +inline uint64_t BitStreamGetInt(const Slice* src, size_t offset, + uint32_t bits) { + return BitStreamGetInt(src->data(), src->size(), offset, bits); +} + } // namespace rocksdb From eda924a03a54a082c5e95b727cf6d1da27cf751b Mon Sep 17 00:00:00 2001 From: kailiu Date: Thu, 23 Jan 2014 22:59:04 -0800 Subject: [PATCH 58/70] Remove an unused `GetLengthPrefixedSlice` Summary: We have 3 versions of GetLengthPrefixedSlice() and one of them is no longer in use. Test Plan: make Reviewers: sdong, igor, haobo, dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D15399 --- util/coding.h | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/util/coding.h b/util/coding.h index 168a49bff..6dd8cb20f 100644 --- a/util/coding.h +++ b/util/coding.h @@ -39,6 +39,7 @@ extern void PutLengthPrefixedSliceParts(std::string* dst, extern bool GetVarint32(Slice* input, uint32_t* value); extern bool GetVarint64(Slice* input, uint64_t* value); extern bool GetLengthPrefixedSlice(Slice* input, Slice* result); +// This function assumes data is well-formed. 
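+// (i.e. "data" begins with a valid varint32 length prefix and is followed
+// by at least that many bytes of payload).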
extern Slice GetLengthPrefixedSlice(const char* data); extern Slice GetSliceUntil(Slice* slice, char delimiter); @@ -249,16 +250,6 @@ inline bool GetVarint64(Slice* input, uint64_t* value) { } } -inline const char* GetLengthPrefixedSlice(const char* p, const char* limit, - Slice* result) { - uint32_t len = 0; - p = GetVarint32Ptr(p, limit, &len); - if (p == nullptr) return nullptr; - if (p + len > limit) return nullptr; - *result = Slice(p, len); - return p + len; -} - inline bool GetLengthPrefixedSlice(Slice* input, Slice* result) { uint32_t len = 0; if (GetVarint32(input, &len) && input->size() >= len) { @@ -272,8 +263,8 @@ inline bool GetLengthPrefixedSlice(Slice* input, Slice* result) { inline Slice GetLengthPrefixedSlice(const char* data) { uint32_t len = 0; - const char* p = data; - p = GetVarint32Ptr(p, p + 5, &len); // +5: we assume "p" is not corrupted + // +5: we assume "data" is not corrupted + auto p = GetVarint32Ptr(data, data + 5 /* limit */, &len); return Slice(p, len); } From 7d991be400a89dcf28edbe529740c9db7eb7e026 Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Fri, 24 Jan 2014 11:09:04 -0800 Subject: [PATCH 59/70] Some small refactorings on table_test Summary: Just revise some hard-to-read or unnecessarily verbose code. Test Plan: make check --- table/table_test.cc | 115 ++++++++++++++++++++++---------------------- 1 file changed, 57 insertions(+), 58 deletions(-) diff --git a/table/table_test.cc b/table/table_test.cc index 972e07012..bf9cea0cc 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -6,6 +6,7 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include #include #include #include @@ -39,15 +40,12 @@ namespace rocksdb { namespace { + // Return reverse of "key". // Used to test non-lexicographic comparators. -static std::string Reverse(const Slice& key) { - std::string str(key.ToString()); - std::string rev(""); - for (std::string::reverse_iterator rit = str.rbegin(); - rit != str.rend(); ++rit) { - rev.push_back(*rit); - } +std::string Reverse(const Slice& key) { + auto rev = key.ToString(); + std::reverse(rev.begin(), rev.end()); return rev; } @@ -76,10 +74,10 @@ class ReverseKeyComparator : public Comparator { *key = Reverse(s); } }; -} // namespace -static ReverseKeyComparator reverse_key_comparator; -static void Increment(const Comparator* cmp, std::string* key) { +ReverseKeyComparator reverse_key_comparator; + +void Increment(const Comparator* cmp, std::string* key) { if (cmp == BytewiseComparator()) { key->push_back('\0'); } else { @@ -91,7 +89,6 @@ static void Increment(const Comparator* cmp, std::string* key) { } // An STL comparator that uses a Comparator -namespace anon { struct STLLessThan { const Comparator* cmp; @@ -101,6 +98,7 @@ struct STLLessThan { return cmp->Compare(Slice(a), Slice(b)) < 0; } }; + } // namespace class StringSink: public WritableFile { @@ -168,13 +166,13 @@ class StringSource: public RandomAccessFile { bool mmap_; }; -typedef std::map KVMap; +typedef std::map KVMap; // Helper class for tests to unify the interface between // BlockBuilder/TableBuilder and Block/Table. 
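// Each subclass supplies its own build step behind Finish() and hands back a format-specific iterator from NewIterator().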
class Constructor { public: - explicit Constructor(const Comparator* cmp) : data_(anon::STLLessThan(cmp)) { } + explicit Constructor(const Comparator* cmp) : data_(STLLessThan(cmp)) {} virtual ~Constructor() { } void Add(const std::string& key, const Slice& value) { @@ -520,61 +518,62 @@ struct TestArgs { CompressionType compression; }; - static std::vector GenerateArgList() { - std::vector ret; - TestType test_type[6] = { BLOCK_BASED_TABLE_TEST, - PLAIN_TABLE_SEMI_FIXED_PREFIX, PLAIN_TABLE_FULL_STR_PREFIX, BLOCK_TEST, - MEMTABLE_TEST, DB_TEST }; - int test_type_len = 6; - bool reverse_compare[2] = {false, true}; - int reverse_compare_len = 2; - int restart_interval[3] = {16, 1, 1024}; - int restart_interval_len = 3; + std::vector test_args; + std::vector test_types = { + BLOCK_BASED_TABLE_TEST, PLAIN_TABLE_SEMI_FIXED_PREFIX, + PLAIN_TABLE_FULL_STR_PREFIX, BLOCK_TEST, + MEMTABLE_TEST, DB_TEST}; + std::vector reverse_compare_types = {false, true}; + std::vector restart_intervals = {16, 1, 1024}; // Only add compression if it is supported - std::vector compression_types; - compression_types.push_back(kNoCompression); + std::vector compression_types = {kNoCompression}; #ifdef SNAPPY - if (SnappyCompressionSupported()) + if (SnappyCompressionSupported()) { compression_types.push_back(kSnappyCompression); + } #endif #ifdef ZLIB - if (ZlibCompressionSupported()) + if (ZlibCompressionSupported()) { compression_types.push_back(kZlibCompression); + } #endif #ifdef BZIP2 - if (BZip2CompressionSupported()) + if (BZip2CompressionSupported()) { compression_types.push_back(kBZip2Compression); + } #endif - for(int i =0; i < test_type_len; i++) - for (int j =0; j < reverse_compare_len; j++) { - if (test_type[i] == PLAIN_TABLE_SEMI_FIXED_PREFIX - || test_type[i] == PLAIN_TABLE_FULL_STR_PREFIX) { + for (auto test_type : test_types) { + for (auto reverse_compare : reverse_compare_types) { + if (test_type == PLAIN_TABLE_SEMI_FIXED_PREFIX || + test_type == PLAIN_TABLE_FULL_STR_PREFIX) { // Plain table doesn't use restart index or compression. 
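// So emit just one arg for this combination: the first restart interval and kNoCompression.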
TestArgs one_arg; - one_arg.type = test_type[i]; - one_arg.reverse_compare = reverse_compare[0]; - one_arg.restart_interval = restart_interval[0]; + one_arg.type = test_type; + one_arg.reverse_compare = reverse_compare; + one_arg.restart_interval = restart_intervals[0]; one_arg.compression = compression_types[0]; - ret.push_back(one_arg); + test_args.push_back(one_arg); continue; } - for (int k = 0; k < restart_interval_len; k++) - for (unsigned int n = 0; n < compression_types.size(); n++) { + for (auto restart_interval : restart_intervals) { + for (auto compression_type : compression_types) { TestArgs one_arg; - one_arg.type = test_type[i]; - one_arg.reverse_compare = reverse_compare[j]; - one_arg.restart_interval = restart_interval[k]; - one_arg.compression = compression_types[n]; - ret.push_back(one_arg); + one_arg.type = test_type; + one_arg.reverse_compare = reverse_compare; + one_arg.restart_interval = restart_interval; + one_arg.compression = compression_type; + test_args.push_back(one_arg); } + } } - return ret; + } + return test_args; } // In order to make all tests run for plain table format, including @@ -1245,7 +1244,7 @@ TEST(TableTest, ApproximateOffsetOfPlain) { } -static void Do_Compression_Test(CompressionType comp) { +static void DoCompressionTest(CompressionType comp) { Random rnd(301); TableConstructor c(BytewiseComparator()); std::string tmp; @@ -1287,7 +1286,7 @@ TEST(TableTest, ApproximateOffsetOfCompressed) { for(int i =0; i < valid; i++) { - Do_Compression_Test(compression_state[i]); + DoCompressionTest(compression_state[i]); } } @@ -1407,9 +1406,9 @@ TEST(MemTableTest, Simple) { // Test the empty key TEST(Harness, SimpleEmptyKey) { - std::vector args = GenerateArgList(); - for (unsigned int i = 0; i < args.size(); i++) { - Init(args[i]); + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); Random rnd(test::RandomSeed() + 1); Add("", "v"); Test(&rnd); @@ -1417,9 +1416,9 @@ TEST(Harness, SimpleEmptyKey) { } TEST(Harness, SimpleSingle) { - std::vector args = GenerateArgList(); - for (unsigned int i = 0; i < args.size(); i++) { - Init(args[i]); + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); Random rnd(test::RandomSeed() + 2); Add("abc", "v"); Test(&rnd); @@ -1427,9 +1426,9 @@ TEST(Harness, SimpleSingle) { } TEST(Harness, SimpleMulti) { - std::vector args = GenerateArgList(); - for (unsigned int i = 0; i < args.size(); i++) { - Init(args[i]); + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); Random rnd(test::RandomSeed() + 3); Add("abc", "v"); Add("abcd", "v"); @@ -1439,9 +1438,9 @@ TEST(Harness, SimpleMulti) { } TEST(Harness, SimpleSpecialKey) { - std::vector args = GenerateArgList(); - for (unsigned int i = 0; i < args.size(); i++) { - Init(args[i]); + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); Random rnd(test::RandomSeed() + 4); Add("\xff\xff", "v3"); Test(&rnd); From 0ab766132bf20d5dc0d17373160d1d72d5f8dfe2 Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Fri, 24 Jan 2014 12:14:08 -0800 Subject: [PATCH 60/70] Re-org the table tests Summary: We'll divide the table tests into 3 buckets, plain table test, block-based table test and general table feature test. This diff does no real change and only does the rename and reorg. 
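For context on the mechanism: the harness's TEST(Bucket, Case) macro files each case under the empty class named in its first argument, so the buckets are just three empty fixture classes. A minimal sketch, using names taken from the diff below:

    // Empty fixture classes act as the three buckets.
    class GeneralTableTest {};
    class BlockBasedTableTest {};
    class PlainTableTest {};

    // Each case is then filed under the bucket it exercises.
    TEST(PlainTableTest, BasicPlainTableProperties)  { /* plain-table-only checks */ }
    TEST(BlockBasedTableTest, BlockCacheTest)        { /* block-based-only checks */ }
    TEST(GeneralTableTest, ApproximateOffsetOfPlain) { /* format-agnostic checks */ }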
Test Plan: run table_test Reviewers: sdong, haobo, igor, dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D15417 --- table/table_test.cc | 176 ++++++++++++++++++++++---------------------- 1 file changed, 89 insertions(+), 87 deletions(-) diff --git a/table/table_test.cc b/table/table_test.cc index bf9cea0cc..44c905816 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -873,11 +873,14 @@ static bool Between(uint64_t val, uint64_t low, uint64_t high) { return result; } -class TableTest { }; +// Tests against all kinds of tables +class GeneralTableTest { }; +class BlockBasedTableTest { }; +class PlainTableTest { }; // This test include all the basic checks except those for index size and block // size, which will be conducted in separated unit tests. -TEST(TableTest, BasicTableProperties) { +TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) { TableConstructor c(BytewiseComparator()); c.Add("a1", "val1"); @@ -921,48 +924,7 @@ TEST(TableTest, BasicTableProperties) { ); } -extern const uint64_t kPlainTableMagicNumber; -TEST(TableTest, BasicPlainTableProperties) { - PlainTableFactory factory(8, 8, 0); - StringSink sink; - std::unique_ptr builder(factory.GetTableBuilder( - Options(), - &sink, - kNoCompression - )); - - for (char c = 'a'; c <= 'z'; ++c) { - std::string key(16, c); - std::string value(28, c + 42); - builder->Add(key, value); - } - ASSERT_OK(builder->Finish()); - - StringSource source(sink.contents(), 72242, true); - - TableProperties props; - auto s = ReadTableProperties( - &source, - sink.contents().size(), - kPlainTableMagicNumber, - Env::Default(), - nullptr, - &props - ); - ASSERT_OK(s); - - ASSERT_EQ(0ul, props.index_size); - ASSERT_EQ(0ul, props.filter_size); - ASSERT_EQ(16ul * 26, props.raw_key_size); - ASSERT_EQ(28ul * 26, props.raw_value_size); - ASSERT_EQ(26ul, props.num_entries); - ASSERT_EQ(1ul, props.num_data_blocks); - - // User collected keys - // internal keys -} - -TEST(TableTest, FilterPolicyNameProperties) { +TEST(BlockBasedTableTest, FilterPolicyNameProperties) { TableConstructor c(BytewiseComparator()); c.Add("a1", "val1"); std::vector keys; @@ -987,7 +949,7 @@ static std::string RandomString(Random* rnd, int len) { // It's very hard to figure out the index block size of a block accurately. // To make sure we get the index size, we just make sure as key number // grows, the filter block size also grows. -TEST(TableTest, IndexSizeStat) { +TEST(BlockBasedTableTest, IndexSizeStat) { uint64_t last_index_size = 0; // we need to use random keys since the pure human readable texts @@ -1022,7 +984,7 @@ TEST(TableTest, IndexSizeStat) { } } -TEST(TableTest, NumBlockStat) { +TEST(BlockBasedTableTest, NumBlockStat) { Random rnd(test::RandomSeed()); TableConstructor c(BytewiseComparator()); Options options; @@ -1085,7 +1047,7 @@ class BlockCacheProperties { long data_block_cache_hit = 0; }; -TEST(TableTest, BlockCacheTest) { +TEST(BlockBasedTableTest, BlockCacheTest) { // -- Table construction Options options; options.create_if_missing = true; @@ -1214,7 +1176,85 @@ TEST(TableTest, BlockCacheTest) { } } -TEST(TableTest, ApproximateOffsetOfPlain) { +TEST(BlockBasedTableTest, BlockCacheLeak) { + // Check that when we reopen a table we don't lose access to blocks already + // in the cache. This test checks whether the Table actually makes use of the + // unique ID from the file. 
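+  // If it did not, reopening would leave the cache cold and the
+  // TEST_KeyInCache assertions at the end of this test would fail.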
+ + Options opt; + opt.block_size = 1024; + opt.compression = kNoCompression; + opt.block_cache = NewLRUCache(16*1024*1024); // big enough so we don't ever + // lose cached values. + + TableConstructor c(BytewiseComparator()); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector keys; + KVMap kvmap; + c.Finish(opt, &keys, &kvmap); + + unique_ptr iter(c.NewIterator()); + iter->SeekToFirst(); + while (iter->Valid()) { + iter->key(); + iter->value(); + iter->Next(); + } + ASSERT_OK(iter->status()); + + ASSERT_OK(c.Reopen(opt)); + for (const std::string& key: keys) { + ASSERT_TRUE(c.table_reader()->TEST_KeyInCache(ReadOptions(), key)); + } +} + + +extern const uint64_t kPlainTableMagicNumber; +TEST(PlainTableTest, BasicPlainTableProperties) { + PlainTableFactory factory(8, 8, 0); + StringSink sink; + std::unique_ptr builder(factory.GetTableBuilder( + Options(), + &sink, + kNoCompression + )); + + for (char c = 'a'; c <= 'z'; ++c) { + std::string key(16, c); + std::string value(28, c + 42); + builder->Add(key, value); + } + ASSERT_OK(builder->Finish()); + + StringSource source(sink.contents(), 72242, true); + + TableProperties props; + auto s = ReadTableProperties( + &source, + sink.contents().size(), + kPlainTableMagicNumber, + Env::Default(), + nullptr, + &props + ); + ASSERT_OK(s); + + ASSERT_EQ(0ul, props.index_size); + ASSERT_EQ(0ul, props.filter_size); + ASSERT_EQ(16ul * 26, props.raw_key_size); + ASSERT_EQ(28ul * 26, props.raw_value_size); + ASSERT_EQ(26ul, props.num_entries); + ASSERT_EQ(1ul, props.num_data_blocks); +} + + +TEST(GeneralTableTest, ApproximateOffsetOfPlain) { TableConstructor c(BytewiseComparator()); c.Add("k01", "hello"); c.Add("k02", "hello2"); @@ -1267,7 +1307,7 @@ static void DoCompressionTest(CompressionType comp) { ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6100)); } -TEST(TableTest, ApproximateOffsetOfCompressed) { +TEST(GeneralTableTest, ApproximateOffsetOfCompressed) { CompressionType compression_state[2]; int valid = 0; if (!SnappyCompressionSupported()) { @@ -1291,44 +1331,6 @@ TEST(TableTest, ApproximateOffsetOfCompressed) { } -TEST(TableTest, BlockCacheLeak) { - // Check that when we reopen a table we don't lose access to blocks already - // in the cache. This test checks whether the Table actually makes use of the - // unique ID from the file. - - Options opt; - opt.block_size = 1024; - opt.compression = kNoCompression; - opt.block_cache = NewLRUCache(16*1024*1024); // big enough so we don't ever - // lose cached values. 
- - TableConstructor c(BytewiseComparator()); - c.Add("k01", "hello"); - c.Add("k02", "hello2"); - c.Add("k03", std::string(10000, 'x')); - c.Add("k04", std::string(200000, 'x')); - c.Add("k05", std::string(300000, 'x')); - c.Add("k06", "hello3"); - c.Add("k07", std::string(100000, 'x')); - std::vector<std::string> keys; - KVMap kvmap; - c.Finish(opt, &keys, &kvmap); - - unique_ptr<Iterator> iter(c.NewIterator()); - iter->SeekToFirst(); - while (iter->Valid()) { - iter->key(); - iter->value(); - iter->Next(); - } - ASSERT_OK(iter->status()); - - ASSERT_OK(c.Reopen(opt)); - for (const std::string& key: keys) { - ASSERT_TRUE(c.table_reader()->TEST_KeyInCache(ReadOptions(), key)); - } -} - TEST(Harness, Randomized) { std::vector<TestArgs> args = GenerateArgList(); for (unsigned int i = 0; i < args.size(); i++) { From 4b51dffcf853f25ecab9102c9babed1d1312af69 Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Fri, 24 Jan 2014 21:10:19 -0800 Subject: [PATCH 61/70] Some refactorings on plain table Summary: Plain table has been working well and this is just a nit-picking patch, which was generated during my code reading. No real functional changes, only some changes regarding: * Improve some comments from the perspective of a "new" code reader. * Change some magic numbers to constants, which can help us parameterize them in the future. * Make some style, naming, and C++ convention changes. * Fix warnings from the new "arc lint" Test Plan: make check Reviewers: sdong, haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D15429 --- table/plain_table_reader.cc | 244 ++++++++++++++++++------------------ table/plain_table_reader.h | 129 ++++++++++--------- table/table_test.cc | 33 ++--- 3 files changed, 202 insertions(+), 204 deletions(-) diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index 2e3838547..ab2e90c25 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -34,29 +34,66 @@ namespace rocksdb { -extern const uint64_t kPlainTableMagicNumber; +namespace { -static uint32_t GetSliceHash(Slice const& s) { +inline uint32_t GetSliceHash(Slice const& s) { return Hash(s.data(), s.size(), 397) ; } -static uint32_t getBucketIdFromHash(uint32_t hash, uint32_t num_buckets) { + +inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) { return hash % num_buckets; } +} // namespace + +// Iterator to iterate IndexedTable +class PlainTableIterator : public Iterator { + public: + explicit PlainTableIterator(PlainTableReader* table); + ~PlainTableIterator(); + + bool Valid() const; + + void SeekToFirst(); + + void SeekToLast(); + + void Seek(const Slice& target); + + void Next(); + + void Prev(); + + Slice key() const; + + Slice value() const; + + Status status() const; + + private: + PlainTableReader* table_; + uint32_t offset_; + uint32_t next_offset_; + Slice key_; + Slice value_; + Status status_; + // No copying allowed + PlainTableIterator(const PlainTableIterator&) = delete; + void operator=(const Iterator&) = delete; +}; + +extern const uint64_t kPlainTableMagicNumber; PlainTableReader::PlainTableReader(const EnvOptions& storage_options, uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio, - const TableProperties& table_properties) : - hash_table_size_(0), soptions_(storage_options), file_size_(file_size), - hash_table_ratio_(hash_table_ratio), - bloom_bits_per_key_(bloom_bits_per_key), - table_properties_(table_properties), data_start_offset_(0), - data_end_offset_(table_properties_.data_size), - user_key_len_(table_properties.fixed_key_len) { - hash_table_ =
nullptr; - bloom_ = nullptr; - sub_index_ = nullptr; -} + const TableProperties& table_properties) + : soptions_(storage_options), + file_size_(file_size), + kHashTableRatio(hash_table_ratio), + kBloomBitsPerKey(bloom_bits_per_key), + table_properties_(table_properties), + data_end_offset_(table_properties_.data_size), + user_key_len_(table_properties.fixed_key_len) {} PlainTableReader::~PlainTableReader() { delete[] hash_table_; @@ -73,30 +110,20 @@ Status PlainTableReader::Open(const Options& options, double hash_table_ratio) { assert(options.allow_mmap_reads); - if (file_size > 2147483646) { + if (file_size > kMaxFileSize) { return Status::NotSupported("File is too large for PlainTableReader!"); } TableProperties table_properties; - auto s = ReadTableProperties( - file.get(), - file_size, - kPlainTableMagicNumber, - options.env, - options.info_log.get(), - &table_properties - ); + auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, + options.env, options.info_log.get(), + &table_properties); if (!s.ok()) { return s; } std::unique_ptr new_reader(new PlainTableReader( - soptions, - file_size, - bloom_num_bits, - hash_table_ratio, - table_properties - )); + soptions, file_size, bloom_num_bits, hash_table_ratio, table_properties)); new_reader->file_ = std::move(file); new_reader->options_ = options; @@ -129,12 +156,11 @@ struct PlainTableReader::IndexRecord { // Helper class to track all the index records class PlainTableReader::IndexRecordList { -public: - explicit IndexRecordList(size_t num_records_per_group) : - num_records_per_group_(num_records_per_group), - current_group_(nullptr), - num_records_in_current_group_(num_records_per_group) { - } + public: + explicit IndexRecordList(size_t num_records_per_group) + : kNumRecordsPerGroup(num_records_per_group), + current_group_(nullptr), + num_records_in_current_group_(num_records_per_group) {} ~IndexRecordList() { for (size_t i = 0; i < groups_.size(); i++) { @@ -143,65 +169,59 @@ public: } void AddRecord(murmur_t hash, uint32_t offset) { - if (num_records_in_current_group_ == num_records_per_group_) { + if (num_records_in_current_group_ == kNumRecordsPerGroup) { current_group_ = AllocateNewGroup(); num_records_in_current_group_ = 0; } - auto& new_record = current_group_[num_records_in_current_group_]; + auto& new_record = current_group_[num_records_in_current_group_++]; new_record.hash = hash; new_record.offset = offset; new_record.next = nullptr; - num_records_in_current_group_++; } - size_t GetNumRecords() { - return (groups_.size() - 1) * num_records_per_group_ - + num_records_in_current_group_; + size_t GetNumRecords() const { + return (groups_.size() - 1) * kNumRecordsPerGroup + + num_records_in_current_group_; } IndexRecord* At(size_t index) { - return &(groups_[index / num_records_per_group_] - [index % num_records_per_group_]); + return &(groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]); } + private: IndexRecord* AllocateNewGroup() { - IndexRecord* result = new IndexRecord[num_records_per_group_]; + IndexRecord* result = new IndexRecord[kNumRecordsPerGroup]; groups_.push_back(result); return result; } -private: - const size_t num_records_per_group_; + + const size_t kNumRecordsPerGroup; IndexRecord* current_group_; // List of arrays allocated std::vector groups_; size_t num_records_in_current_group_; }; -int PlainTableReader::PopulateIndexRecordList( - IndexRecordList& record_list) { - Slice key_slice; - Slice key_prefix_slice; - Slice key_suffix_slice; - Slice value_slice; +int 
PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list) { Slice prev_key_prefix_slice; uint32_t prev_key_prefix_hash = 0; uint32_t pos = data_start_offset_; int key_index_within_prefix = 0; - bool first = true; - std::string prefix_sub_index; + bool is_first_record = true; HistogramImpl keys_per_prefix_hist; // Need map to be ordered to make sure sub indexes generated // are in order. int num_prefixes = 0; - while (pos < data_end_offset_) { uint32_t key_offset = pos; + Slice key_slice; + Slice value_slice; status_ = Next(pos, &key_slice, &value_slice, pos); - key_prefix_slice = GetPrefix(key_slice); + Slice key_prefix_slice = GetPrefix(key_slice); - if (first || prev_key_prefix_slice != key_prefix_slice) { - num_prefixes++; - if (!first) { + if (is_first_record || prev_key_prefix_slice != key_prefix_slice) { + ++num_prefixes; + if (!is_first_record) { keys_per_prefix_hist.Add(key_index_within_prefix); } key_index_within_prefix = 0; @@ -209,12 +229,13 @@ int PlainTableReader::PopulateIndexRecordList( prev_key_prefix_hash = GetSliceHash(key_prefix_slice); } - if (key_index_within_prefix++ % 16 == 0) { - // Add an index key for every 16 keys - record_list.AddRecord(prev_key_prefix_hash, key_offset); + if (key_index_within_prefix++ % kIndexIntervalForSamePrefixKeys == 0) { + // Add an index key for every kIndexIntervalForSamePrefixKeys keys + record_list->AddRecord(prev_key_prefix_hash, key_offset); } - first = false; + is_first_record = false; } + keys_per_prefix_hist.Add(key_index_within_prefix); Log(options_.info_log, "Number of Keys per prefix Histogram: %s", keys_per_prefix_hist.ToString().c_str()); @@ -222,23 +243,22 @@ int PlainTableReader::PopulateIndexRecordList( return num_prefixes; } -void PlainTableReader::Allocate(int num_prefixes) { - if (hash_table_ != nullptr) { - delete[] hash_table_; - } - if (bloom_bits_per_key_ > 0) { - bloom_ = new DynamicBloom(num_prefixes * bloom_bits_per_key_); +void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) { + delete[] hash_table_; + + if (kBloomBitsPerKey > 0) { + bloom_ = new DynamicBloom(num_prefixes * kBloomBitsPerKey); } double hash_table_size_multipier = - (hash_table_ratio_ > 1.0) ? 1.0 : 1.0 / hash_table_ratio_; + (kHashTableRatio > 1.0) ? 
1.0 : 1.0 / kHashTableRatio; hash_table_size_ = num_prefixes * hash_table_size_multipier + 1; hash_table_ = new uint32_t[hash_table_size_]; } size_t PlainTableReader::BucketizeIndexesAndFillBloom( IndexRecordList& record_list, int num_prefixes, - std::vector& hash2offsets, - std::vector& bucket_count) { + std::vector* hash_to_offsets, + std::vector* bucket_count) { size_t sub_index_size_needed = 0; bool first = true; uint32_t prev_hash = 0; @@ -253,32 +273,34 @@ size_t PlainTableReader::BucketizeIndexesAndFillBloom( bloom_->AddHash(cur_hash); } } - uint32_t bucket = getBucketIdFromHash(cur_hash, hash_table_size_); - IndexRecord* prev_bucket_head = hash2offsets[bucket]; + uint32_t bucket = GetBucketIdFromHash(cur_hash, hash_table_size_); + IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket]; index_record->next = prev_bucket_head; - hash2offsets[bucket] = index_record; - if (bucket_count[bucket] > 0) { - if (bucket_count[bucket] == 1) { + (*hash_to_offsets)[bucket] = index_record; + auto& item_count = (*bucket_count)[bucket]; + if (item_count > 0) { + if (item_count == 1) { sub_index_size_needed += kOffsetLen + 1; } - if (bucket_count[bucket] == 127) { + if (item_count == 127) { // Need more than one byte for length sub_index_size_needed++; } sub_index_size_needed += kOffsetLen; } - bucket_count[bucket]++; + item_count++; } return sub_index_size_needed; } -void PlainTableReader::FillIndexes(size_t sub_index_size_needed, - std::vector& hash2offsets, - std::vector& bucket_count) { +void PlainTableReader::FillIndexes( + size_t sub_index_size_needed, + const std::vector& hash_to_offsets, + const std::vector& bucket_count) { Log(options_.info_log, "Reserving %zu bytes for sub index", sub_index_size_needed); - // 4 bytes buffer for variable length size - size_t buffer_size = 64; + // 8 bytes buffer for variable length size + size_t buffer_size = 8 * 8; size_t buffer_used = 0; sub_index_size_needed += buffer_size; sub_index_ = new char[sub_index_size_needed]; @@ -286,7 +308,6 @@ void PlainTableReader::FillIndexes(size_t sub_index_size_needed, char* prev_ptr; char* cur_ptr; uint32_t* sub_index_ptr; - IndexRecord* record; for (int i = 0; i < hash_table_size_; i++) { uint32_t num_keys_for_bucket = bucket_count[i]; switch (num_keys_for_bucket) { @@ -296,14 +317,14 @@ void PlainTableReader::FillIndexes(size_t sub_index_size_needed, break; case 1: // point directly to the file offset - hash_table_[i] = hash2offsets[i]->offset; + hash_table_[i] = hash_to_offsets[i]->offset; break; default: // point to second level indexes. hash_table_[i] = sub_index_offset | kSubIndexMask; prev_ptr = sub_index_ + sub_index_offset; cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket); - sub_index_offset += cur_ptr - prev_ptr; + sub_index_offset += (cur_ptr - prev_ptr); if (cur_ptr - prev_ptr > 2 || (cur_ptr - prev_ptr == 2 && num_keys_for_bucket <= 127)) { // Need to resize sub_index. Exponentially grow buffer. 
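// (Exponential growth keeps the amortized cost of these rare resizes linear.)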
@@ -321,10 +342,10 @@ void PlainTableReader::FillIndexes(size_t sub_index_size_needed, } } sub_index_ptr = (uint32_t*) (sub_index_ + sub_index_offset); - record = hash2offsets[i]; + IndexRecord* record = hash_to_offsets[i]; int j; - for (j = num_keys_for_bucket - 1; - j >= 0 && record; j--, record = record->next) { + for (j = num_keys_for_bucket - 1; j >= 0 && record; + j--, record = record->next) { sub_index_ptr[j] = record->offset; } assert(j == -1 && record == nullptr); @@ -337,24 +358,6 @@ void PlainTableReader::FillIndexes(size_t sub_index_size_needed, hash_table_size_, sub_index_size_needed); } -// PopulateIndex() builds index of keys. -// hash_table_ contains buckets size of hash_table_size_, each is a 32-bit -// integer. The lower 31 bits contain an offset value (explained below) and -// the first bit of the integer indicates type of the offset: -// -// 0 indicates that the bucket contains only one prefix (no conflict when -// hashing this prefix), whose first row starts from this offset of the file. -// 1 indicates that the bucket contains more than one prefixes, or there -// are too many rows for one prefix so we need a binary search for it. In -// this case, the offset indicates the offset of sub_index_ holding the -// binary search indexes of keys for those rows. Those binary search indexes -// are organized in this way: -// -// The first 4 bytes, indicates how many indexes (N) are stored after it. After -// it, there are N 32-bit integers, each points of an offset of the file, which -// points to starting of a row. Those offsets need to be guaranteed to be in -// ascending order so the keys they are pointing to are also in ascending order -// to make sure we can use them to do binary searches. Status PlainTableReader::PopulateIndex() { // Get mmapped memory to file_data_. Status s = file_->Read(0, file_size_, &file_data_, nullptr); @@ -362,25 +365,24 @@ Status PlainTableReader::PopulateIndex() { return s; } - IndexRecordList record_list(256); - // First, read the whole file, for every 16 rows for a prefix (starting from - // the first one), generate a record of (hash, offset) and append it to - // IndexRecordList, which is a data structure created to store them. - int num_prefixes = PopulateIndexRecordList(record_list); + IndexRecordList record_list(kRecordsPerGroup); + // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows + // for a prefix (starting from the first one), generate a record of (hash, + // offset) and append it to IndexRecordList, which is a data structure created + // to store them. + int num_prefixes = PopulateIndexRecordList(&record_list); // Calculated hash table and bloom filter size and allocate memory for indexes // and bloom filter based on the number of prefixes. - Allocate(num_prefixes); + AllocateIndexAndBloom(num_prefixes); // Bucketize all the index records to a temp data structure, in which for // each bucket, we generate a linked list of IndexRecord, in reversed order. - std::vector hash2offsets(hash_table_size_, nullptr); + std::vector hash_to_offsets(hash_table_size_, nullptr); std::vector bucket_count(hash_table_size_, 0); - size_t sub_index_size_needed = BucketizeIndexesAndFillBloom(record_list, - num_prefixes, - hash2offsets, - bucket_count); + size_t sub_index_size_needed = BucketizeIndexesAndFillBloom( + record_list, num_prefixes, &hash_to_offsets, &bucket_count); // From the temp data structure, populate indexes. 
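// Each bucket will end up pointing either directly at a row offset or at a binary-search range in sub_index_.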
- FillIndexes(sub_index_size_needed, hash2offsets, bucket_count); + FillIndexes(sub_index_size_needed, hash_to_offsets, bucket_count); return Status::OK(); } @@ -389,7 +391,7 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, uint32_t prefix_hash, bool& prefix_matched, uint32_t& ret_offset) { prefix_matched = false; - int bucket = getBucketIdFromHash(prefix_hash, hash_table_size_); + int bucket = GetBucketIdFromHash(prefix_hash, hash_table_size_); uint32_t bucket_value = hash_table_[bucket]; if (bucket_value == data_end_offset_) { ret_offset = data_end_offset_; diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index 6d2efc7da..144c4686a 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -5,6 +5,7 @@ #pragma once #include #include +#include #include #include "rocksdb/env.h" #include "rocksdb/iterator.h" @@ -35,7 +36,7 @@ using std::unordered_map; // // The implementation of IndexedTableReader requires output file is mmaped class PlainTableReader: public TableReader { -public: + public: static Status Open(const Options& options, const EnvOptions& soptions, unique_ptr && file, uint64_t file_size, unique_ptr* table, const int bloom_num_bits, @@ -65,12 +66,12 @@ public: const TableProperties& table_properties); ~PlainTableReader(); -private: + private: struct IndexRecord; class IndexRecordList; uint32_t* hash_table_ = nullptr; - int hash_table_size_; + int hash_table_size_ = 0; char* sub_index_ = nullptr; Options options_; @@ -82,24 +83,30 @@ private: uint32_t version_; uint32_t file_size_; - const double hash_table_ratio_; - const int bloom_bits_per_key_; - DynamicBloom* bloom_; + const double kHashTableRatio; + const int kBloomBitsPerKey; + DynamicBloom* bloom_ = nullptr; TableProperties table_properties_; - const uint32_t data_start_offset_; + const uint32_t data_start_offset_ = 0; const uint32_t data_end_offset_; const size_t user_key_len_; static const size_t kNumInternalBytes = 8; static const uint32_t kSubIndexMask = 0x80000000; static const size_t kOffsetLen = sizeof(uint32_t); - - bool IsFixedLength() { + static const uint64_t kMaxFileSize = 1u << 31; + static const size_t kRecordsPerGroup = 256; + // To speed up the search for keys with same prefix, we'll add index key for + // every N keys, where the "N" is determined by + // kIndexIntervalForSamePrefixKeys + static const size_t kIndexIntervalForSamePrefixKeys = 16; + + bool IsFixedLength() const { return user_key_len_ != PlainTableFactory::kVariableLength; } - size_t GetFixedInternalKeyLength() { + size_t GetFixedInternalKeyLength() const { return user_key_len_ + kNumInternalBytes; } @@ -108,32 +115,67 @@ private: // Internal helper function to generate an IndexRecordList object from all // the rows, which contains index records as a list. - int PopulateIndexRecordList(IndexRecordList& record_list); + int PopulateIndexRecordList(IndexRecordList* record_list); // Internal helper function to allocate memory for indexes and bloom filters - void Allocate(int num_prefixes); + void AllocateIndexAndBloom(int num_prefixes); // Internal helper function to bucket index record list to hash buckets. - // hash2offsets is sized of of hash_table_size_, each contains a linked list + // hash_to_offsets is sized of of hash_table_size_, each contains a linked + // list // of offsets for the hash, in reversed order. // bucket_count is sized of hash_table_size_. The value is how many index - // records are there in hash2offsets for the same bucket. 
+ // records are there in hash_to_offsets for the same bucket. size_t BucketizeIndexesAndFillBloom( IndexRecordList& record_list, int num_prefixes, - std::vector& hash2offsets, - std::vector& bucket_count); + std::vector* hash_to_offsets, + std::vector* bucket_count); // Internal helper class to fill the indexes and bloom filters to internal - // data structures. hash2offsets and bucket_count are bucketized indexes and - // counts generated by BucketizeIndexesAndFillBloom(). + // data structures. hash_to_offsets and bucket_count are bucketized indexes + // and counts generated by BucketizeIndexesAndFillBloom(). void FillIndexes(size_t sub_index_size_needed, - std::vector& hash2offsets, - std::vector& bucket_count); - - // Populate the internal indexes. It must be called before - // any query to the table. - // This query will populate the hash table hash_table_, the second - // level of indexes sub_index_ and bloom filter filter_slice_ if enabled. + const std::vector& hash_to_offsets, + const std::vector& bucket_count); + + // PopulateIndex() builds index of keys. It must be called before any query + // to the table. + // + // hash_table_ contains buckets size of hash_table_size_, each is a 32-bit + // integer. The lower 31 bits contain an offset value (explained below) and + // the first bit of the integer indicates type of the offset. + // + // +--------------+------------------------------------------------------+ + // | Flag (1 bit) | Offset to binary search buffer or file (31 bits) + + // +--------------+------------------------------------------------------+ + // + // Explanation for the "flag bit": + // + // 0 indicates that the bucket contains only one prefix (no conflict when + // hashing this prefix), whose first row starts from this offset of the + // file. + // 1 indicates that the bucket contains more than one prefixes, or there + // are too many rows for one prefix so we need a binary search for it. In + // this case, the offset indicates the offset of sub_index_ holding the + // binary search indexes of keys for those rows. Those binary search indexes + // are organized in this way: + // + // The first 4 bytes, indicate how many indexes (N) are stored after it. After + // it, there are N 32-bit integers, each points of an offset of the file, + // which + // points to starting of a row. Those offsets need to be guaranteed to be in + // ascending order so the keys they are pointing to are also in ascending + // order + // to make sure we can use them to do binary searches. Below is visual + // presentation of a bucket. + // + // + // number_of_records: varint32 + // record 1 file offset: fixedint32 + // record 2 file offset: fixedint32 + // .... + // record N file offset: fixedint32 + // Status PopulateIndex(); // Check bloom filter to see whether it might contain this prefix. 
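// Returning false means the prefix is definitely absent; true may be a false positive, at a rate governed by kBloomBitsPerKey.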
@@ -163,41 +205,4 @@ private: explicit PlainTableReader(const TableReader&) = delete; void operator=(const TableReader&) = delete; }; - -// Iterator to iterate IndexedTable -class PlainTableIterator: public Iterator { -public: - explicit PlainTableIterator(PlainTableReader* table); - ~PlainTableIterator(); - - bool Valid() const; - - void SeekToFirst(); - - void SeekToLast(); - - void Seek(const Slice& target); - - void Next(); - - void Prev(); - - Slice key() const; - - Slice value() const; - - Status status() const; - -private: - PlainTableReader* table_; - uint32_t offset_; - uint32_t next_offset_; - Slice key_; - Slice value_; - Status status_; - // No copying allowed - PlainTableIterator(const PlainTableIterator&) = delete; - void operator=(const Iterator&) = delete; -}; - } // namespace rocksdb diff --git a/table/table_test.cc b/table/table_test.cc index 44c905816..9b4d6d808 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -874,9 +874,9 @@ static bool Between(uint64_t val, uint64_t low, uint64_t high) { } // Tests against all kinds of tables -class GeneralTableTest { }; -class BlockBasedTableTest { }; -class PlainTableTest { }; +class GeneralTableTest {}; +class BlockBasedTableTest {}; +class PlainTableTest {}; // This test include all the basic checks except those for index size and block // size, which will be conducted in separated unit tests. @@ -1184,8 +1184,9 @@ TEST(BlockBasedTableTest, BlockCacheLeak) { Options opt; opt.block_size = 1024; opt.compression = kNoCompression; - opt.block_cache = NewLRUCache(16*1024*1024); // big enough so we don't ever - // lose cached values. + opt.block_cache = + NewLRUCache(16 * 1024 * 1024); // big enough so we don't ever + // lose cached values. TableConstructor c(BytewiseComparator()); c.Add("k01", "hello"); @@ -1209,21 +1210,17 @@ TEST(BlockBasedTableTest, BlockCacheLeak) { ASSERT_OK(iter->status()); ASSERT_OK(c.Reopen(opt)); - for (const std::string& key: keys) { + for (const std::string& key : keys) { ASSERT_TRUE(c.table_reader()->TEST_KeyInCache(ReadOptions(), key)); } } - extern const uint64_t kPlainTableMagicNumber; TEST(PlainTableTest, BasicPlainTableProperties) { PlainTableFactory factory(8, 8, 0); StringSink sink; - std::unique_ptr builder(factory.GetTableBuilder( - Options(), - &sink, - kNoCompression - )); + std::unique_ptr builder( + factory.GetTableBuilder(Options(), &sink, kNoCompression)); for (char c = 'a'; c <= 'z'; ++c) { std::string key(16, c); @@ -1235,14 +1232,9 @@ TEST(PlainTableTest, BasicPlainTableProperties) { StringSource source(sink.contents(), 72242, true); TableProperties props; - auto s = ReadTableProperties( - &source, - sink.contents().size(), - kPlainTableMagicNumber, - Env::Default(), - nullptr, - &props - ); + auto s = ReadTableProperties(&source, sink.contents().size(), + kPlainTableMagicNumber, Env::Default(), nullptr, + &props); ASSERT_OK(s); ASSERT_EQ(0ul, props.index_size); @@ -1253,7 +1245,6 @@ TEST(PlainTableTest, BasicPlainTableProperties) { ASSERT_EQ(1ul, props.num_data_blocks); } - TEST(GeneralTableTest, ApproximateOffsetOfPlain) { TableConstructor c(BytewiseComparator()); c.Add("k01", "hello"); From b20486f29469b005ac0b07c7fb1723faadf6c877 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Fri, 24 Jan 2014 17:50:59 -0800 Subject: [PATCH 62/70] [Performance Branch] HashLinkList to avoid to convert length prefixed string back to internal keys Summary: Converting from length prefixed buffer back to internal key costs some CPU but it is not necessary. 
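A memtable entry is stored as a varint32 length followed by the internal key, so code that only held the length-prefixed form had to decode it before every comparison — roughly this pattern (a sketch of the cost being removed, not the exact code):

    // Hot path before: recover the internal key from the length-prefixed entry.
    Slice internal_key = GetLengthPrefixedSlice(memtable_key);
    // After this patch, callers that already hold the internal key pass it
    // straight down, e.g. iter->Seek(key.internal_key(), mem_key.data()).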
In this patch, internal keys are passed through the functions so that we don't need to convert back to them. Test Plan: make all check Reviewers: haobo, kailiu Reviewed By: kailiu CC: igor, leveldb Differential Revision: https://reviews.facebook.net/D15393 --- db/memtable.cc | 24 ++++++++++++------- db/memtable.h | 5 +++- include/rocksdb/memtablerep.h | 6 ++++- util/hash_linklist_rep.cc | 45 +++++++++++++++++++++++------------ util/hash_skiplist_rep.cc | 5 ++-- 5 files changed, 58 insertions(+), 27 deletions(-) diff --git a/db/memtable.cc b/db/memtable.cc index e0e2a5c2f..deb3f7ad7 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -65,12 +65,20 @@ size_t MemTable::ApproximateMemoryUsage() { table_->ApproximateMemoryUsage(); } -int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr) +int MemTable::KeyComparator::operator()(const char* prefix_len_key1, + const char* prefix_len_key2) const { + // Internal keys are encoded as length-prefixed strings. + Slice k1 = GetLengthPrefixedSlice(prefix_len_key1); + Slice k2 = GetLengthPrefixedSlice(prefix_len_key2); + return comparator.Compare(k1, k2); +} + +int MemTable::KeyComparator::operator()(const char* prefix_len_key, + const Slice& key) const { // Internal keys are encoded as length-prefixed strings. - Slice a = GetLengthPrefixedSlice(aptr); - Slice b = GetLengthPrefixedSlice(bptr); - return comparator.Compare(a, b); + Slice a = GetLengthPrefixedSlice(prefix_len_key); + return comparator.Compare(a, key); } Slice MemTableRep::UserKey(const char* key) const { @@ -213,7 +221,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, // iter is null if prefix bloom says the key does not exist } else { iter.reset(table_->GetIterator(user_key)); - iter->Seek(user_key, mem_key.data()); + iter->Seek(key.internal_key(), mem_key.data()); } bool merge_in_progress = s->IsMergeInProgress(); @@ -325,7 +333,7 @@ void MemTable::Update(SequenceNumber seq, std::unique_ptr<MemTableRep::Iterator> iter( table_->GetIterator(lkey.user_key())); - iter->Seek(key, mem_key.data()); + iter->Seek(lkey.internal_key(), mem_key.data()); if (iter->Valid()) { // entry format is: @@ -389,7 +397,7 @@ bool MemTable::UpdateCallback(SequenceNumber seq, std::shared_ptr<MemTableRep::Iterator> iter( table_->GetIterator(lkey.user_key())); - iter->Seek(key, memkey.data()); + iter->Seek(lkey.internal_key(), memkey.data()); if (iter->Valid()) { // entry format is: @@ -461,7 +469,7 @@ size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) { // The iterator only needs to be ordered within the same user key. std::unique_ptr<MemTableRep::Iterator> iter( table_->GetIterator(key.user_key())); - iter->Seek(key.user_key(), memkey.data()); + iter->Seek(key.internal_key(), memkey.data()); size_t num_successive_merges = 0; diff --git a/db/memtable.h b/db/memtable.h index 5e7eeb4a1..aca4aaf16 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -30,7 +30,10 @@ class MemTable { struct KeyComparator : public MemTableRep::KeyComparator { const InternalKeyComparator comparator; explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { } - virtual int operator()(const char* a, const char* b) const; + virtual int operator()(const char* prefix_len_key1, + const char* prefix_len_key2) const; + virtual int operator()(const char* prefix_len_key, + const Slice& key) const override; }; // MemTables are reference counted.
The initial reference count diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index 1a16a30cc..3cb03c7fc 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -51,7 +51,11 @@ class MemTableRep { public: // Compare a and b. Return a negative value if a is less than b, 0 if they // are equal, and a positive value if a is greater than b - virtual int operator()(const char* a, const char* b) const = 0; + virtual int operator()(const char* prefix_len_key1, + const char* prefix_len_key2) const = 0; + + virtual int operator()(const char* prefix_len_key, + const Slice& key) const = 0; virtual ~KeyComparator() { } }; diff --git a/util/hash_linklist_rep.cc b/util/hash_linklist_rep.cc index 9c2d8f52c..844907a28 100644 --- a/util/hash_linklist_rep.cc +++ b/util/hash_linklist_rep.cc @@ -92,7 +92,11 @@ class HashLinkListRep : public MemTableRep { // immutable after construction Arena* const arena_; - bool BucketContains(Node* head, const Key& key) const; + bool BucketContains(Node* head, const Slice& key) const; + + Slice GetPrefix(const Slice& internal_key) const { + return transform_->Transform(ExtractUserKey(internal_key)); + } size_t GetHash(const Slice& slice) const { return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_; @@ -111,14 +115,25 @@ class HashLinkListRep : public MemTableRep { return new (mem) Node(key); } + bool Equal(const Slice& a, const Key& b) const { + return (compare_(b, a) == 0); + } + + bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); } + bool KeyIsAfterNode(const Slice& internal_key, const Node* n) const { + // nullptr n is considered infinite + return (n != nullptr) && (compare_(n->key, internal_key) < 0); + } + bool KeyIsAfterNode(const Key& key, const Node* n) const { // nullptr n is considered infinite return (n != nullptr) && (compare_(n->key, key) < 0); } - Node* FindGreaterOrEqualInBucket(Node* head, const Key& key) const; + + Node* FindGreaterOrEqualInBucket(Node* head, const Slice& key) const; class FullListIterator : public MemTableRep::Iterator { public: @@ -219,11 +234,8 @@ class HashLinkListRep : public MemTableRep { // Advance to the first entry with a key >= target virtual void Seek(const Slice& internal_key, const char* memtable_key) { - const char* encoded_key = - (memtable_key != nullptr) ? - memtable_key : EncodeKey(&tmp_, internal_key); node_ = hash_link_list_rep_->FindGreaterOrEqualInBucket(head_, - encoded_key); + internal_key); } // Position at the first entry in collection. @@ -267,7 +279,7 @@ class HashLinkListRep : public MemTableRep { // Advance to the first entry with a key >= target virtual void Seek(const Slice& k, const char* memtable_key) { - auto transformed = memtable_rep_.transform_->Transform(k); + auto transformed = memtable_rep_.GetPrefix(k); Reset(memtable_rep_.GetBucket(transformed)); HashLinkListRep::Iterator::Seek(k, memtable_key); } @@ -320,7 +332,8 @@ HashLinkListRep::~HashLinkListRep() { void HashLinkListRep::Insert(const char* key) { assert(!Contains(key)); - auto transformed = transform_->Transform(UserKey(key)); + Slice internal_key = GetLengthPrefixedSlice(key); + auto transformed = GetPrefix(internal_key); auto& bucket = buckets_[GetHash(transformed)]; Node* head = static_cast(bucket.Acquire_Load()); @@ -344,7 +357,7 @@ void HashLinkListRep::Insert(const char* key) { // If x points to head_ or next points nullptr, it is trivially satisfied. 
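// Otherwise the assert checks that cur orders strictly before its successor, i.e. the bucket list stays sorted.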
assert((cur == head) || (next == nullptr) || KeyIsAfterNode(next->key, cur)); - if (KeyIsAfterNode(key, cur)) { + if (KeyIsAfterNode(internal_key, cur)) { // Keep searching in this list prev = cur; cur = next; @@ -370,12 +383,14 @@ void HashLinkListRep::Insert(const char* key) { } bool HashLinkListRep::Contains(const char* key) const { - auto transformed = transform_->Transform(UserKey(key)); + Slice internal_key = GetLengthPrefixedSlice(key); + + auto transformed = GetPrefix(internal_key); auto bucket = GetBucket(transformed); if (bucket == nullptr) { return false; } - return BucketContains(bucket, key); + return BucketContains(bucket, internal_key); } size_t HashLinkListRep::ApproximateMemoryUsage() { @@ -414,13 +429,13 @@ MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator() { return new DynamicIterator(*this); } -bool HashLinkListRep::BucketContains(Node* head, const Key& key) const { - Node* x = FindGreaterOrEqualInBucket(head, key); - return (x != nullptr && Equal(key, x->key)); +bool HashLinkListRep::BucketContains(Node* head, const Slice& user_key) const { + Node* x = FindGreaterOrEqualInBucket(head, user_key); + return (x != nullptr && Equal(user_key, x->key)); } Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head, - const Key& key) const { + const Slice& key) const { Node* x = head; while (true) { if (x == nullptr) { diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc index 906f83030..845137a4c 100644 --- a/util/hash_skiplist_rep.cc +++ b/util/hash_skiplist_rep.cc @@ -170,7 +170,7 @@ class HashSkipListRep : public MemTableRep { // Advance to the first entry with a key >= target virtual void Seek(const Slice& k, const char* memtable_key) { - auto transformed = memtable_rep_.transform_->Transform(k); + auto transformed = memtable_rep_.transform_->Transform(ExtractUserKey(k)); Reset(memtable_rep_.GetBucket(transformed)); HashSkipListRep::Iterator::Seek(k, memtable_key); } @@ -209,7 +209,8 @@ class HashSkipListRep : public MemTableRep { } virtual void Next() { } virtual void Prev() { } - virtual void Seek(const Slice& user_key, const char* memtable_key) { } + virtual void Seek(const Slice& internal_key, + const char* memtable_key) { } virtual void SeekToFirst() { } virtual void SeekToLast() { } private: From 3170abd297fd63fce3cddc44dfb926f4ffb32844 Mon Sep 17 00:00:00 2001 From: kailiu Date: Wed, 29 Jan 2014 16:39:05 -0800 Subject: [PATCH 63/70] Remove unused classes Summary: This is a followup diff for https://reviews.facebook.net/D15447, which picks the most simple task: delete some unused memtable reps. Test Plan: make Reviewers: haobo, dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D15585 --- include/rocksdb/memtablerep.h | 81 +---------------------------------- 1 file changed, 1 insertion(+), 80 deletions(-) diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index 15906ceed..d5641cb78 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -180,86 +180,7 @@ public: } }; -// HashSkipListRep is backed by hash map of buckets. Each bucket is a skip -// list. All the keys with the same prefix will be in the same bucket. -// The prefix is determined using user supplied SliceTransform. It has -// to match prefix_extractor in options.prefix_extractor. -// -// Iteration over the entire collection is implemented by dumping all the keys -// into a separate skip list. Thus, these data structures are best used when -// iteration over the entire collection is rare. 
-// -// Parameters: -// transform: The SliceTransform to bucket user keys on. TransformRepFactory -// owns the pointer. -// bucket_count: Passed to the constructor of the underlying -// std::unordered_map of each TransformRep. On initialization, the -// underlying array will be at least bucket_count size. -// num_locks: Number of read-write locks to have for the rep. Each bucket is -// hashed onto a read-write lock which controls access to that lock. More -// locks means finer-grained concurrency but more memory overhead. -class TransformRepFactory : public MemTableRepFactory { - public: - explicit TransformRepFactory(const SliceTransform* transform, - size_t bucket_count, size_t num_locks = 1000) - : transform_(transform), - bucket_count_(bucket_count), - num_locks_(num_locks) { } - - virtual ~TransformRepFactory() { delete transform_; } - - virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, - Arena*) override; - - virtual const char* Name() const override { - return "TransformRepFactory"; - } - - const SliceTransform* GetTransform() { return transform_; } - - protected: - const SliceTransform* transform_; - const size_t bucket_count_; - const size_t num_locks_; -}; - -// UnsortedReps bin user keys based on an identity function transform -- that -// is, transform(key) = key. This optimizes for point look-ups. -// -// Parameters: See TransformRepFactory. -class UnsortedRepFactory : public TransformRepFactory { -public: - explicit UnsortedRepFactory(size_t bucket_count = 0, size_t num_locks = 1000) - : TransformRepFactory(NewNoopTransform(), - bucket_count, - num_locks) { } - virtual const char* Name() const override { - return "UnsortedRepFactory"; - } -}; - -// PrefixHashReps bin user keys based on a fixed-size prefix. This optimizes for -// short ranged scans over a given prefix. -// -// Parameters: See TransformRepFactory. -class PrefixHashRepFactory : public TransformRepFactory { -public: - explicit PrefixHashRepFactory(const SliceTransform* prefix_extractor, - size_t bucket_count = 0, size_t num_locks = 1000) - : TransformRepFactory(prefix_extractor, bucket_count, num_locks) - { } - - virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, - Arena*) override; - - virtual const char* Name() const override { - return "PrefixHashRepFactory"; - } -}; - -// The same as TransformRepFactory except it doesn't use locks. -// Experimental, will replace TransformRepFactory once we are sure -// it performs better. It contains a fixed array of buckets, each +// This class contains a fixed array of buckets, each // pointing to a skiplist (null if the bucket is empty). // bucket_count: number of fixed array buckets // skiplist_height: the max height of the skiplist From a46ac92138eb074620edad637a279a25d2dcbb6a Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Mon, 27 Jan 2014 12:45:08 -0800 Subject: [PATCH 64/70] Allow command line tool sst-dump to display table properties. Summary: Add option '--show_properties' to sst_dump tool to allow displaying property block of the specified files. Test Plan: Run sst_dump with the following arguments, which covers cases affected by this diff: 1. with only --file 2. with both --file and --show_properties 3. 
with --file, --show_properties, and --from Reviewers: kailiu, xjin Differential Revision: https://reviews.facebook.net/D15453 --- tools/sst_dump.cc | 127 ++++++++++++++++++++++++++++++---------------- 1 file changed, 83 insertions(+), 44 deletions(-) diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc index 903889556..ba586aca1 100644 --- a/tools/sst_dump.cc +++ b/tools/sst_dump.cc @@ -15,6 +15,7 @@ #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/table.h" +#include "rocksdb/table_properties.h" #include "table/block.h" #include "table/block_builder.h" #include "table/format.h" @@ -38,22 +39,51 @@ class SstFileReader { bool has_to, const std::string& to_key); + Status ReadTableProperties(TableProperties* table_properties); uint64_t GetReadNumber() { return read_num_; } -private: + private: + Status NewTableReader(const std::string& file_path); + std::string file_name_; uint64_t read_num_; bool verify_checksum_; bool output_hex_; EnvOptions soptions_; + + Status init_result_; + unique_ptr table_reader_; + unique_ptr file_; + // table_options_ and internal_comparator_ will also be used in + // ReadSequential internally (specifically, seek-related operations) + Options table_options_; + InternalKeyComparator internal_comparator_; }; SstFileReader::SstFileReader(const std::string& file_path, bool verify_checksum, bool output_hex) - :file_name_(file_path), read_num_(0), verify_checksum_(verify_checksum), - output_hex_(output_hex) { - std::cout << "Process " << file_path << "\n"; + :file_name_(file_path), read_num_(0), verify_checksum_(verify_checksum), + output_hex_(output_hex), internal_comparator_(BytewiseComparator()) { + fprintf(stdout, "Process %s\n", file_path.c_str()); + + init_result_ = NewTableReader(file_name_); +} + +Status SstFileReader::NewTableReader(const std::string& file_path) { + table_options_.comparator = &internal_comparator_; + Status s = table_options_.env->NewRandomAccessFile(file_path, &file_, + soptions_); + if (!s.ok()) { + return s; + } + uint64_t file_size; + table_options_.env->GetFileSize(file_path, &file_size); + unique_ptr table_factory; + s = table_options_.table_factory->GetTableReader(table_options_, soptions_, + std::move(file_), file_size, + &table_reader_); + return s; } Status SstFileReader::ReadSequential(bool print_kv, @@ -61,29 +91,12 @@ Status SstFileReader::ReadSequential(bool print_kv, bool has_from, const std::string& from_key, bool has_to, - const std::string& to_key) -{ - unique_ptr table_reader; - InternalKeyComparator internal_comparator_(BytewiseComparator()); - Options table_options; - table_options.comparator = &internal_comparator_; - unique_ptr file; - Status s = table_options.env->NewRandomAccessFile(file_name_, &file, - soptions_); - if(!s.ok()) { - return s; - } - uint64_t file_size; - table_options.env->GetFileSize(file_name_, &file_size); - unique_ptr table_factory; - s = table_options.table_factory->GetTableReader(table_options, soptions_, - std::move(file), file_size, - &table_reader); - if(!s.ok()) { - return s; + const std::string& to_key) { + if (!table_reader_) { + return init_result_; } - Iterator* iter = table_reader->NewIterator(ReadOptions(verify_checksum_, + Iterator* iter = table_reader_->NewIterator(ReadOptions(verify_checksum_, false)); uint64_t i = 0; if (has_from) { @@ -113,21 +126,29 @@ Status SstFileReader::ReadSequential(bool print_kv, } if (print_kv) { - std::cout << ikey.DebugString(output_hex_) - << " => " - << value.ToString(output_hex_) << "\n"; + fprintf(stdout, "%s => %s\n", + 
ikey.DebugString(output_hex_).c_str(), + value.ToString(output_hex_).c_str()); } + } - } + read_num_ += i; + + Status ret = iter->status(); + delete iter; + return ret; +} - read_num_ += i; +Status SstFileReader::ReadTableProperties(TableProperties* table_properties) { + if (!table_reader_) { + return init_result_; + } - Status ret = iter->status(); - delete iter; - return ret; + *table_properties = table_reader_->GetTableProperties(); + return init_result_; } -} // namespace rocksdb +} // namespace rocksdb static void print_help() { fprintf(stderr, @@ -137,7 +158,8 @@ static void print_help() { " [--input_key_hex]" " [--from=]" " [--to=]" - " [--read_num=NUM]\n"); + " [--read_num=NUM]" + " [--show_properties]\n"); } string HexToString(const string& str) { @@ -158,7 +180,6 @@ string HexToString(const string& str) { } int main(int argc, char** argv) { - const char* dir_or_file = nullptr; uint64_t read_num = -1; std::string command; @@ -170,10 +191,10 @@ int main(int argc, char** argv) { bool input_key_hex = false; bool has_from = false; bool has_to = false; + bool show_properties = false; std::string from_key; std::string to_key; - for (int i = 1; i < argc; i++) - { + for (int i = 1; i < argc; i++) { if (strncmp(argv[i], "--file=", 7) == 0) { dir_or_file = argv[i] + 7; } else if (strcmp(argv[i], "--output_hex") == 0) { @@ -194,7 +215,9 @@ int main(int argc, char** argv) { } else if (strncmp(argv[i], "--to=", 5) == 0) { to_key = argv[i] + 5; has_to = true; - }else { + } else if (strcmp(argv[i], "--show_properties") == 0) { + show_properties = true; + } else { print_help(); exit(1); } @@ -210,7 +233,7 @@ int main(int argc, char** argv) { } } - if(dir_or_file == nullptr) { + if (dir_or_file == nullptr) { print_help(); exit(1); } @@ -225,18 +248,19 @@ int main(int argc, char** argv) { dir = false; } - std::cout << "from [" << rocksdb::Slice(from_key).ToString(true) - << "] to [" << rocksdb::Slice(to_key).ToString(true) << "]\n"; + fprintf(stdout, "from [%s] to [%s]\n", + rocksdb::Slice(from_key).ToString(true).c_str(), + rocksdb::Slice(to_key).ToString(true).c_str()); uint64_t total_read = 0; for (size_t i = 0; i < filenames.size(); i++) { std::string filename = filenames.at(i); if (filename.length() <= 4 || filename.rfind(".sst") != filename.length() - 4) { - //ignore + // ignore continue; } - if(dir) { + if (dir) { filename = std::string(dir_or_file) + "/" + filename; } rocksdb::SstFileReader reader(filename, verify_checksum, @@ -257,5 +281,20 @@ int main(int argc, char** argv) { break; } } + if (show_properties) { + rocksdb::TableProperties table_properties; + st = reader.ReadTableProperties(&table_properties); + if (!st.ok()) { + fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str()); + } else { + fprintf(stdout, + "Table Properties:\n" + "------------------------------\n" + " %s", table_properties.ToString("\n ", ": ").c_str()); + fprintf(stdout, "# deleted keys: %lu\n", + rocksdb::GetDeletedKeys( + table_properties.user_collected_properties)); + } + } } } From 4e0298f23ccf42e3c0a72bbca1cb80a43d8e5a68 Mon Sep 17 00:00:00 2001 From: kailiu Date: Thu, 30 Jan 2014 17:18:17 -0800 Subject: [PATCH 65/70] Clean up arena API Summary: Easy thing goes first. This patch moves arena to internal dir; based on which, the coming patch will deal with memtable_rep. 
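For illustration, a minimal sketch of the consolidated API, using only the
declarations visible in the util/arena.h diff below; the wrapping main() is
hypothetical, and since the header is internal after this patch, it only
builds inside the RocksDB source tree:

#include "util/arena.h"

int main() {
  // The block size is rounded up into [kMinBlockSize, kMaxBlockSize] and to
  // a multiple of the pointer size by OptimizeBlockSize().
  rocksdb::Arena arena(8192 /* block_size */);

  // Small requests are carved out of the current block; requests larger than
  // a quarter of the block size get a dedicated block. Allocate() and
  // AllocateAligned() are plain member functions now, no longer virtual.
  char* buf = arena.Allocate(128);
  char* aligned = arena.AllocateAligned(256);
  (void)buf;
  (void)aligned;

  // MemoryAllocatedBytes() counts every block handed out by new[];
  // ApproximateMemoryUsage() excludes the allocated-but-unused tail of the
  // current block.
  size_t allocated = arena.MemoryAllocatedBytes();
  size_t estimate = arena.ApproximateMemoryUsage();
  (void)allocated;
  (void)estimate;

  return 0;  // ~Arena() delete[]s every block it allocated
}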
Test Plan: make check Reviewers: haobo, sdong, dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D15615 --- HISTORY.md | 6 ++++ db/memtable.cc | 12 ++++---- db/memtable.h | 4 +-- db/plain_table_db_test.cc | 7 +++-- db/simple_table_db_test.cc | 1 + db/skiplist.h | 2 +- db/skiplist_test.cc | 14 +++++----- include/rocksdb/arena.h | 45 ------------------------------ include/rocksdb/memtablerep.h | 6 +--- table/table_test.cc | 1 + tools/sst_dump.cc | 6 ++-- util/{arena_impl.cc => arena.cc} | 21 +++++++------- util/{arena_impl.h => arena.h} | 26 ++++++++--------- util/arena_test.cc | 48 +++++++++++++++----------------- util/hash_linklist_rep.cc | 2 +- util/hash_skiplist_rep.cc | 2 +- util/vectorrep.cc | 2 +- 17 files changed, 79 insertions(+), 126 deletions(-) delete mode 100644 include/rocksdb/arena.h rename util/{arena_impl.cc => arena.cc} (82%) rename util/{arena_impl.h => arena.h} (81%) diff --git a/HISTORY.md b/HISTORY.md index 912599bc9..c8fbabd98 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,11 @@ # Rocksdb Change Log +## 2.8.0 (01/28/2014) + +### Public API changes + +* Removed arena.h from public header files. + ## 2.7.0 (01/28/2014) ### Public API changes diff --git a/db/memtable.cc b/db/memtable.cc index deb3f7ad7..e47181298 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -17,6 +17,8 @@ #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/merge_operator.h" +#include "rocksdb/slice_transform.h" +#include "util/arena.h" #include "util/coding.h" #include "util/murmurhash.h" #include "util/mutexlock.h" @@ -38,9 +40,8 @@ namespace rocksdb { MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options) : comparator_(cmp), refs_(0), - arena_impl_(options.arena_block_size), - table_(options.memtable_factory->CreateMemTableRep(comparator_, - &arena_impl_)), + arena_(options.arena_block_size), + table_(options.memtable_factory->CreateMemTableRep(comparator_, &arena_)), flush_in_progress_(false), flush_completed_(false), file_number_(0), @@ -61,8 +62,7 @@ MemTable::~MemTable() { } size_t MemTable::ApproximateMemoryUsage() { - return arena_impl_.ApproximateMemoryUsage() + - table_->ApproximateMemoryUsage(); + return arena_.ApproximateMemoryUsage() + table_->ApproximateMemoryUsage(); } int MemTable::KeyComparator::operator()(const char* prefix_len_key1, @@ -184,7 +184,7 @@ void MemTable::Add(SequenceNumber s, ValueType type, const size_t encoded_len = VarintLength(internal_key_size) + internal_key_size + VarintLength(val_size) + val_size; - char* buf = arena_impl_.Allocate(encoded_len); + char* buf = arena_.Allocate(encoded_len); char* p = EncodeVarint32(buf, internal_key_size); memcpy(p, key.data(), key_size); p += key_size; diff --git a/db/memtable.h b/db/memtable.h index aca4aaf16..349359f8b 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -16,7 +16,7 @@ #include "db/version_set.h" #include "rocksdb/db.h" #include "rocksdb/memtablerep.h" -#include "util/arena_impl.h" +#include "util/arena.h" #include "util/dynamic_bloom.h" namespace rocksdb { @@ -161,7 +161,7 @@ class MemTable { KeyComparator comparator_; int refs_; - ArenaImpl arena_impl_; + Arena arena_; unique_ptr table_; // These are used to manage memtable flushes to storage diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 8f56638e0..81c0c1ff4 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -11,17 +11,18 @@ #include #include -#include "rocksdb/db.h" -#include "rocksdb/filter_policy.h" #include "db/db_impl.h" #include 
"db/filename.h" #include "db/version_set.h" #include "db/write_batch_internal.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" #include "rocksdb/env.h" -#include "rocksdb/table.h" +#include "rocksdb/filter_policy.h" #include "rocksdb/plain_table_factory.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc index 0f3b89d9b..34bdb5f60 100644 --- a/db/simple_table_db_test.cc +++ b/db/simple_table_db_test.cc @@ -31,6 +31,7 @@ using std::unique_ptr; +// IS THIS FILE STILL NEEDED? namespace rocksdb { // SimpleTable is a simple table format for UNIT TEST ONLY. It is not built diff --git a/db/skiplist.h b/db/skiplist.h index d6c81688e..e713fe42a 100644 --- a/db/skiplist.h +++ b/db/skiplist.h @@ -34,8 +34,8 @@ #include #include #include "port/port.h" +#include "util/arena.h" #include "util/random.h" -#include "rocksdb/arena.h" namespace rocksdb { diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc index dcbaf0abb..b87ddcbb0 100644 --- a/db/skiplist_test.cc +++ b/db/skiplist_test.cc @@ -10,7 +10,7 @@ #include "db/skiplist.h" #include #include "rocksdb/env.h" -#include "util/arena_impl.h" +#include "util/arena.h" #include "util/hash.h" #include "util/random.h" #include "util/testharness.h" @@ -34,9 +34,9 @@ struct TestComparator { class SkipTest { }; TEST(SkipTest, Empty) { - ArenaImpl arena_impl; + Arena arena; TestComparator cmp; - SkipList list(cmp, &arena_impl); + SkipList list(cmp, &arena); ASSERT_TRUE(!list.Contains(10)); SkipList::Iterator iter(&list); @@ -54,9 +54,9 @@ TEST(SkipTest, InsertAndLookup) { const int R = 5000; Random rnd(1000); std::set keys; - ArenaImpl arena_impl; + Arena arena; TestComparator cmp; - SkipList list(cmp, &arena_impl); + SkipList list(cmp, &arena); for (int i = 0; i < N; i++) { Key key = rnd.Next() % R; if (keys.insert(key).second) { @@ -209,14 +209,14 @@ class ConcurrentTest { // Current state of the test State current_; - ArenaImpl arena_impl_; + Arena arena_; // SkipList is not protected by mu_. We just use a single writer // thread to modify it. SkipList list_; public: - ConcurrentTest() : list_(TestComparator(), &arena_impl_) { } + ConcurrentTest() : list_(TestComparator(), &arena_) {} // REQUIRES: External synchronization void WriteStep(Random* rnd) { diff --git a/include/rocksdb/arena.h b/include/rocksdb/arena.h deleted file mode 100644 index 642b61408..000000000 --- a/include/rocksdb/arena.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Arena class defines memory allocation methods. It's used by memtable and -// skiplist. - -#ifndef STORAGE_ROCKSDB_INCLUDE_ARENA_H_ -#define STORAGE_ROCKSDB_INCLUDE_ARENA_H_ - -#include -#include - -namespace rocksdb { - -class Arena { - public: - Arena() {}; - virtual ~Arena() {}; - - // Return a pointer to a newly allocated memory block of "bytes" bytes. 
- virtual char* Allocate(size_t bytes) = 0; - - // Allocate memory with the normal alignment guarantees provided by malloc. - virtual char* AllocateAligned(size_t bytes) = 0; - - // Returns an estimate of the total memory used by arena. - virtual const size_t ApproximateMemoryUsage() = 0; - - // Returns the total number of bytes in all blocks allocated so far. - virtual const size_t MemoryAllocatedBytes() = 0; - - private: - // No copying allowed - Arena(const Arena&); - void operator=(const Arena&); -}; - -} // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_ARENA_H_ diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index d5641cb78..e9a41aedd 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -33,11 +33,9 @@ // iteration over the entire collection is rare since doing so requires all the // keys to be copied into a sorted data structure. -#ifndef STORAGE_ROCKSDB_DB_MEMTABLEREP_H_ -#define STORAGE_ROCKSDB_DB_MEMTABLEREP_H_ +#pragma once #include -#include "rocksdb/slice_transform.h" namespace rocksdb { @@ -199,5 +197,3 @@ extern MemTableRepFactory* NewHashLinkListRepFactory( const SliceTransform* transform, size_t bucket_count = 50000); } - -#endif // STORAGE_ROCKSDB_DB_MEMTABLEREP_H_ diff --git a/table/table_test.cc b/table/table_test.cc index 4f53ec4da..d165fd2f2 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -23,6 +23,7 @@ #include "rocksdb/plain_table_factory.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" #include "rocksdb/memtablerep.h" #include "table/meta_blocks.h" #include "rocksdb/plain_table_factory.h" diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc index ba586aca1..1c6b9cfc1 100644 --- a/tools/sst_dump.cc +++ b/tools/sst_dump.cc @@ -291,9 +291,9 @@ int main(int argc, char** argv) { "Table Properties:\n" "------------------------------\n" " %s", table_properties.ToString("\n ", ": ").c_str()); - fprintf(stdout, "# deleted keys: %lu\n", - rocksdb::GetDeletedKeys( - table_properties.user_collected_properties)); + fprintf(stdout, "# deleted keys: %zd\n", + rocksdb::GetDeletedKeys( + table_properties.user_collected_properties)); } } } diff --git a/util/arena_impl.cc b/util/arena.cc similarity index 82% rename from util/arena_impl.cc rename to util/arena.cc index 5125e2364..dffc8b88e 100644 --- a/util/arena_impl.cc +++ b/util/arena.cc @@ -7,19 +7,19 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "util/arena_impl.h" +#include "util/arena.h" #include namespace rocksdb { -const size_t ArenaImpl::kMinBlockSize = 4096; -const size_t ArenaImpl::kMaxBlockSize = 2 << 30; +const size_t Arena::kMinBlockSize = 4096; +const size_t Arena::kMaxBlockSize = 2 << 30; static const int kAlignUnit = sizeof(void*); size_t OptimizeBlockSize(size_t block_size) { // Make sure block_size is in optimal range - block_size = std::max(ArenaImpl::kMinBlockSize, block_size); - block_size = std::min(ArenaImpl::kMaxBlockSize, block_size); + block_size = std::max(Arena::kMinBlockSize, block_size); + block_size = std::min(Arena::kMaxBlockSize, block_size); // make sure block_size is the multiple of kAlignUnit if (block_size % kAlignUnit != 0) { @@ -29,19 +29,18 @@ size_t OptimizeBlockSize(size_t block_size) { return block_size; } -ArenaImpl::ArenaImpl(size_t block_size) - : kBlockSize(OptimizeBlockSize(block_size)) { +Arena::Arena(size_t block_size) : kBlockSize(OptimizeBlockSize(block_size)) { assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize && kBlockSize % kAlignUnit == 0); } -ArenaImpl::~ArenaImpl() { +Arena::~Arena() { for (const auto& block : blocks_) { delete[] block; } } -char* ArenaImpl::AllocateFallback(size_t bytes, bool aligned) { +char* Arena::AllocateFallback(size_t bytes, bool aligned) { if (bytes > kBlockSize / 4) { // Object is more than a quarter of our block size. Allocate it separately // to avoid wasting too much space in leftover bytes. @@ -63,7 +62,7 @@ char* ArenaImpl::AllocateFallback(size_t bytes, bool aligned) { } } -char* ArenaImpl::AllocateAligned(size_t bytes) { +char* Arena::AllocateAligned(size_t bytes) { assert((kAlignUnit & (kAlignUnit - 1)) == 0); // Pointer size should be a power of 2 size_t current_mod = @@ -83,7 +82,7 @@ char* ArenaImpl::AllocateAligned(size_t bytes) { return result; } -char* ArenaImpl::AllocateNewBlock(size_t block_bytes) { +char* Arena::AllocateNewBlock(size_t block_bytes) { char* block = new char[block_bytes]; blocks_memory_ += block_bytes; blocks_.push_back(block); diff --git a/util/arena_impl.h b/util/arena.h similarity index 81% rename from util/arena_impl.h rename to util/arena.h index 538385ccc..4c45417f4 100644 --- a/util/arena_impl.h +++ b/util/arena.h @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -// ArenaImpl is an implementation of Arena class. For a request of small size, +// Arena is an implementation of Arena class. For a request of small size, // it allocates a block with pre-defined block size. For a request of big // size, it uses malloc to directly get the requested size. 
@@ -16,37 +16,35 @@ #include #include #include -#include "rocksdb/arena.h" +#include "util/arena.h" namespace rocksdb { -class ArenaImpl : public Arena { +class Arena { public: // No copying allowed - ArenaImpl(const ArenaImpl&) = delete; - void operator=(const ArenaImpl&) = delete; + Arena(const Arena&) = delete; + void operator=(const Arena&) = delete; static const size_t kMinBlockSize; static const size_t kMaxBlockSize; - explicit ArenaImpl(size_t block_size = kMinBlockSize); - virtual ~ArenaImpl(); + explicit Arena(size_t block_size = kMinBlockSize); + ~Arena(); - virtual char* Allocate(size_t bytes) override; + char* Allocate(size_t bytes); - virtual char* AllocateAligned(size_t bytes) override; + char* AllocateAligned(size_t bytes); // Returns an estimate of the total memory usage of data allocated // by the arena (exclude the space allocated but not yet used for future // allocations). - virtual const size_t ApproximateMemoryUsage() { + const size_t ApproximateMemoryUsage() { return blocks_memory_ + blocks_.capacity() * sizeof(char*) - alloc_bytes_remaining_; } - virtual const size_t MemoryAllocatedBytes() override { - return blocks_memory_; - } + const size_t MemoryAllocatedBytes() { return blocks_memory_; } private: // Number of bytes allocated in one block @@ -72,7 +70,7 @@ class ArenaImpl : public Arena { size_t blocks_memory_ = 0; }; -inline char* ArenaImpl::Allocate(size_t bytes) { +inline char* Arena::Allocate(size_t bytes) { // The semantics of what to return are a bit messy if we allow // 0-byte allocations, so we disallow them here (we don't need // them for our internal use). diff --git a/util/arena_test.cc b/util/arena_test.cc index ca6dfc99d..1b2b53175 100644 --- a/util/arena_test.cc +++ b/util/arena_test.cc @@ -7,34 +7,32 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "util/arena_impl.h" +#include "util/arena.h" #include "util/random.h" #include "util/testharness.h" namespace rocksdb { -class ArenaImplTest { }; +class ArenaTest {}; -TEST(ArenaImplTest, Empty) { - ArenaImpl arena0; -} +TEST(ArenaTest, Empty) { Arena arena0; } -TEST(ArenaImplTest, MemoryAllocatedBytes) { +TEST(ArenaTest, MemoryAllocatedBytes) { const int N = 17; - size_t req_sz; //requested size + size_t req_sz; // requested size size_t bsz = 8192; // block size size_t expected_memory_allocated; - ArenaImpl arena_impl(bsz); + Arena arena(bsz); // requested size > quarter of a block: // allocate requested size separately req_sz = 3001; for (int i = 0; i < N; i++) { - arena_impl.Allocate(req_sz); + arena.Allocate(req_sz); } expected_memory_allocated = req_sz * N; - ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated); + ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); // requested size < quarter of a block: // allocate a block with the default size, then try to use unused part @@ -42,28 +40,28 @@ TEST(ArenaImplTest, MemoryAllocatedBytes) { // Allocate(99) call. All the remaining calls won't lead to new allocation. 
req_sz = 99; for (int i = 0; i < N; i++) { - arena_impl.Allocate(req_sz); + arena.Allocate(req_sz); } expected_memory_allocated += bsz; - ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated); + ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); // requested size > quarter of a block: // allocate requested size separately req_sz = 99999999; for (int i = 0; i < N; i++) { - arena_impl.Allocate(req_sz); + arena.Allocate(req_sz); } expected_memory_allocated += req_sz * N; - ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated); + ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); } // Make sure we didn't count the allocate but not used memory space in // Arena::ApproximateMemoryUsage() -TEST(ArenaImplTest, ApproximateMemoryUsageTest) { +TEST(ArenaTest, ApproximateMemoryUsageTest) { const size_t kBlockSize = 4096; const size_t kEntrySize = kBlockSize / 8; - const size_t kZero = 0; - ArenaImpl arena(kBlockSize); + const size_t kZero = 0; + Arena arena(kBlockSize); ASSERT_EQ(kZero, arena.ApproximateMemoryUsage()); auto num_blocks = kBlockSize / kEntrySize; @@ -83,9 +81,9 @@ TEST(ArenaImplTest, ApproximateMemoryUsageTest) { ASSERT_GT(usage, mem_usage); } -TEST(ArenaImplTest, Simple) { +TEST(ArenaTest, Simple) { std::vector> allocated; - ArenaImpl arena_impl; + Arena arena; const int N = 100000; size_t bytes = 0; Random rnd(301); @@ -104,9 +102,9 @@ TEST(ArenaImplTest, Simple) { } char* r; if (rnd.OneIn(10)) { - r = arena_impl.AllocateAligned(s); + r = arena.AllocateAligned(s); } else { - r = arena_impl.Allocate(s); + r = arena.Allocate(s); } for (unsigned int b = 0; b < s; b++) { @@ -115,9 +113,9 @@ TEST(ArenaImplTest, Simple) { } bytes += s; allocated.push_back(std::make_pair(s, r)); - ASSERT_GE(arena_impl.ApproximateMemoryUsage(), bytes); + ASSERT_GE(arena.ApproximateMemoryUsage(), bytes); if (i > N / 10) { - ASSERT_LE(arena_impl.ApproximateMemoryUsage(), bytes * 1.10); + ASSERT_LE(arena.ApproximateMemoryUsage(), bytes * 1.10); } } for (unsigned int i = 0; i < allocated.size(); i++) { @@ -132,6 +130,4 @@ TEST(ArenaImplTest, Simple) { } // namespace rocksdb -int main(int argc, char** argv) { - return rocksdb::test::RunAllTests(); -} +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/util/hash_linklist_rep.cc b/util/hash_linklist_rep.cc index 844907a28..83f0f3d5a 100644 --- a/util/hash_linklist_rep.cc +++ b/util/hash_linklist_rep.cc @@ -7,7 +7,7 @@ #include "util/hash_linklist_rep.h" #include "rocksdb/memtablerep.h" -#include "rocksdb/arena.h" +#include "util/arena.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "port/port.h" diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc index 845137a4c..aa070bc8b 100644 --- a/util/hash_skiplist_rep.cc +++ b/util/hash_skiplist_rep.cc @@ -7,7 +7,7 @@ #include "util/hash_skiplist_rep.h" #include "rocksdb/memtablerep.h" -#include "rocksdb/arena.h" +#include "util/arena.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "port/port.h" diff --git a/util/vectorrep.cc b/util/vectorrep.cc index bd2f0873d..4b8b3d552 100644 --- a/util/vectorrep.cc +++ b/util/vectorrep.cc @@ -11,7 +11,7 @@ #include #include -#include "rocksdb/arena.h" +#include "util/arena.h" #include "db/memtable.h" #include "port/port.h" #include "util/mutexlock.h" From 4f6cb17bdb74129c860ab051420acd811ce84e93 Mon Sep 17 00:00:00 2001 From: kailiu Date: Mon, 27 Jan 2014 21:58:46 -0800 Subject: [PATCH 66/70] First phase API clean 
up Summary: Addressed all the issues in https://reviews.facebook.net/D15447. Now most table-related modules are hidden from user land. Test Plan: make check Reviewers: sdong, haobo, dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D15525 --- db/builder.cc | 11 +- db/builder.h | 3 +- db/db_impl.cc | 6 +- db/db_test.cc | 2 +- db/plain_table_db_test.cc | 2 +- db/simple_table_db_test.cc | 22 +- db/table_cache.cc | 7 +- db/table_cache.h | 8 +- db/table_properties_collector_test.cc | 11 +- db/version_set.cc | 3 +- include/rocksdb/table.h | 212 +++++------------- include/rocksdb/table_properties.h | 20 +- table/block_based_table_builder.cc | 2 +- table/block_based_table_builder.h | 2 +- table/block_based_table_factory.cc | 9 +- table/block_based_table_factory.h | 18 +- table/block_based_table_options.h | 31 --- table/block_based_table_reader.cc | 3 +- table/block_based_table_reader.h | 21 +- table/meta_blocks.cc | 2 +- table/plain_table_builder.h | 2 +- table/plain_table_factory.cc | 21 +- .../rocksdb => table}/plain_table_factory.h | 48 ++-- table/plain_table_reader.cc | 16 +- table/plain_table_reader.h | 22 +- table/table_builder.h | 55 +++++ table/table_factory.h | 82 +++++++ table/table_reader.h | 70 ++++++ table/table_reader_bench.cc | 6 +- table/table_test.cc | 26 +-- tools/sst_dump.cc | 5 +- util/options.cc | 3 +- 32 files changed, 400 insertions(+), 351 deletions(-) delete mode 100644 table/block_based_table_options.h rename {include/rocksdb => table}/plain_table_factory.h (62%) create mode 100644 table/table_builder.h create mode 100644 table/table_factory.h create mode 100644 table/table_reader.h diff --git a/db/builder.cc b/db/builder.cc index 930fc180a..3b51bf88e 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -9,26 +9,26 @@ #include "db/builder.h" -#include "db/filename.h" #include "db/dbformat.h" +#include "db/filename.h" #include "db/merge_helper.h" #include "db/table_cache.h" #include "db/version_edit.h" #include "rocksdb/db.h" -#include "rocksdb/table.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "table/block_based_table_builder.h" +#include "table/table_factory.h" #include "util/stop_watch.h" namespace rocksdb { class TableFactory; -TableBuilder* GetTableBuilder(const Options& options, WritableFile* file, +TableBuilder* NewTableBuilder(const Options& options, WritableFile* file, CompressionType compression_type) { - return options.table_factory->GetTableBuilder(options, file, + return options.table_factory->NewTableBuilder(options, file, compression_type); } @@ -64,8 +64,7 @@ Status BuildTable(const std::string& dbname, return s; } - TableBuilder* builder = GetTableBuilder(options, file.get(), - compression); + TableBuilder* builder = NewTableBuilder(options, file.get(), compression); // the first key is the smallest key Slice key = iter->key(); diff --git a/db/builder.h b/db/builder.h index 2600dc24b..189bfe6fe 100644 --- a/db/builder.h +++ b/db/builder.h @@ -24,8 +24,7 @@ class VersionEdit; class TableBuilder; class WritableFile; - -extern TableBuilder* GetTableBuilder(const Options& options, WritableFile* file, +extern TableBuilder* NewTableBuilder(const Options& options, WritableFile* file, CompressionType compression_type); // Build a Table file from the contents of *iter. 
The generated file diff --git a/db/db_impl.cc b/db/db_impl.cc index f3ad64e74..9a7d0c178 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -48,9 +48,10 @@ #include "table/block.h" #include "table/block_based_table_factory.h" #include "table/merger.h" +#include "table/table_builder.h" #include "table/two_level_iterator.h" -#include "util/autovector.h" #include "util/auto_roll_logger.h" +#include "util/autovector.h" #include "util/build_version.h" #include "util/coding.h" #include "util/hash_skiplist_rep.h" @@ -58,7 +59,6 @@ #include "util/mutexlock.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" -#include "util/autovector.h" namespace rocksdb { @@ -2138,7 +2138,7 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { compact->compaction->enable_compression()); compact->builder.reset( - GetTableBuilder(options_, compact->outfile.get(), compression_type)); + NewTableBuilder(options_, compact->outfile.get(), compression_type)); } LogFlush(options_.info_log); return s; diff --git a/db/db_test.cc b/db/db_test.cc index 56059371c..b1fb3f9de 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -21,7 +21,7 @@ #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" #include "rocksdb/perf_context.h" -#include "rocksdb/plain_table_factory.h" +#include "table/plain_table_factory.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 81c0c1ff4..0d554278c 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -20,9 +20,9 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" -#include "rocksdb/plain_table_factory.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" +#include "table/plain_table_factory.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc index 34bdb5f60..845165ec2 100644 --- a/db/simple_table_db_test.cc +++ b/db/simple_table_db_test.cc @@ -22,6 +22,8 @@ #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" #include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/table_builder.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" @@ -92,8 +94,6 @@ public: uint64_t ApproximateOffsetOf(const Slice& key) override; - bool TEST_KeyInCache(const ReadOptions& options, const Slice& key) override; - void SetupForCompaction() override; TableProperties& GetTableProperties() override; @@ -296,11 +296,6 @@ Status SimpleTableReader::Get( return s; } -bool SimpleTableReader::TEST_KeyInCache(const ReadOptions& options, - const Slice& key) { - return false; -} - uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) { return 0; } @@ -541,25 +536,24 @@ public: const char* Name() const override { return "SimpleTable"; } - Status GetTableReader(const Options& options, const EnvOptions& soptions, - unique_ptr && file, - uint64_t file_size, + Status NewTableReader(const Options& options, const EnvOptions& soptions, + unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const; - TableBuilder* GetTableBuilder(const Options& options, WritableFile* file, + TableBuilder* NewTableBuilder(const Options& options, WritableFile* file, CompressionType compression_type) const; }; -Status SimpleTableFactory::GetTableReader( +Status SimpleTableFactory::NewTableReader( const Options& options, const EnvOptions& soptions, - unique_ptr && 
file, uint64_t file_size, + unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const { return SimpleTableReader::Open(options, soptions, std::move(file), file_size, table_reader); } -TableBuilder* SimpleTableFactory::GetTableBuilder( +TableBuilder* SimpleTableFactory::NewTableBuilder( const Options& options, WritableFile* file, CompressionType compression_type) const { return new SimpleTableBuilder(options, file, compression_type); diff --git a/db/table_cache.cc b/db/table_cache.cc index 527a10cba..6e48e1a90 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -13,7 +13,7 @@ #include "db/version_edit.h" #include "rocksdb/statistics.h" -#include "rocksdb/table.h" +#include "table/table_reader.h" #include "util/coding.h" #include "util/stop_watch.h" @@ -83,9 +83,8 @@ Status TableCache::FindTable(const EnvOptions& toptions, file->Hint(RandomAccessFile::RANDOM); } StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS); - s = options_->table_factory->GetTableReader(*options_, toptions, - std::move(file), file_size, - &table_reader); + s = options_->table_factory->NewTableReader( + *options_, toptions, std::move(file), file_size, &table_reader); } if (!s.ok()) { diff --git a/db/table_cache.h b/db/table_cache.h index ba50ae4d5..665d3b901 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -12,11 +12,13 @@ #pragma once #include #include + #include "db/dbformat.h" -#include "rocksdb/env.h" -#include "rocksdb/cache.h" #include "port/port.h" -#include "rocksdb/table.h" +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "table/table_factory.h" +#include "table/table_reader.h" namespace rocksdb { diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index b7ff97b34..15cbe9213 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -7,14 +7,14 @@ #include #include -#include "db/dbformat.h" #include "db/db_impl.h" +#include "db/dbformat.h" #include "db/table_properties_collector.h" -#include "rocksdb/table_properties.h" #include "rocksdb/table.h" -#include "rocksdb/plain_table_factory.h" #include "table/block_based_table_factory.h" #include "table/meta_blocks.h" +#include "table/plain_table_factory.h" +#include "table/table_builder.h" #include "util/coding.h" #include "util/testharness.h" #include "util/testutil.h" @@ -88,9 +88,8 @@ void MakeBuilder( std::unique_ptr* writable, std::unique_ptr* builder) { writable->reset(new FakeWritableFile); - builder->reset( - options.table_factory->GetTableBuilder(options, writable->get(), - options.compression)); + builder->reset(options.table_factory->NewTableBuilder( + options, writable->get(), options.compression)); } // Collects keys that starts with "A" in a table. diff --git a/db/version_set.cc b/db/version_set.cc index a360436db..f15a3a49a 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -12,6 +12,7 @@ #include #include #include + #include "db/filename.h" #include "db/log_reader.h" #include "db/log_writer.h" @@ -21,7 +22,7 @@ #include "db/compaction.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" -#include "rocksdb/table.h" +#include "table/table_reader.h" #include "table/merger.h" #include "table/two_level_iterator.h" #include "util/coding.h" diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 2d2bfacc4..a9be3e572 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -1,180 +1,72 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
-// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Currently we support two types of tables: plain table and block-based table. +// 1. Block-based table: this is the default table type that we inherited from +// LevelDB, which was designed for storing data in hard disk or flash +// device. +// 2. Plain table: it is one of RocksDB's SST file format optimized +// for low query latency on pure-memory or really low-latency media. +// +// A tutorial of rocksdb table formats is available here: +// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats +// +// Example code is also available +// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples #pragma once #include -#include +#include +#include + #include "rocksdb/env.h" #include "rocksdb/iterator.h" -#include "rocksdb/table_properties.h" #include "rocksdb/options.h" +#include "rocksdb/status.h" namespace rocksdb { -struct Options; -class RandomAccessFile; -struct ReadOptions; -class TableCache; -class WritableFile; - -using std::unique_ptr; - -// TableBuilder provides the interface used to build a Table -// (an immutable and sorted map from keys to values). -// -// Multiple threads can invoke const methods on a TableBuilder without -// external synchronization, but if any of the threads may call a -// non-const method, all threads accessing the same TableBuilder must use -// external synchronization. -class TableBuilder { - public: - // REQUIRES: Either Finish() or Abandon() has been called. - virtual ~TableBuilder() {} - - // Add key,value to the table being constructed. - // REQUIRES: key is after any previously added key according to comparator. - // REQUIRES: Finish(), Abandon() have not been called - virtual void Add(const Slice& key, const Slice& value) = 0; - - // Return non-ok iff some error has been detected. - virtual Status status() const = 0; - - // Finish building the table. - // REQUIRES: Finish(), Abandon() have not been called - virtual Status Finish() = 0; - - // Indicate that the contents of this builder should be abandoned. - // If the caller is not going to call Finish(), it must call Abandon() - // before destroying this builder. - // REQUIRES: Finish(), Abandon() have not been called - virtual void Abandon() = 0; - - // Number of calls to Add() so far. - virtual uint64_t NumEntries() const = 0; - - // Size of the file generated so far. If invoked after a successful - // Finish() call, returns the size of the final generated file. - virtual uint64_t FileSize() const = 0; -}; +class TableFactory; -// A Table is a sorted map from strings to strings. Tables are -// immutable and persistent. A Table may be safely accessed from -// multiple threads without external synchronization. -class TableReader { - public: - virtual ~TableReader() {} +// -- Block-based Table +class FlushBlockPolicyFactory; - // Determine whether there is a chance that the current table file - // contains the key a key starting with iternal_prefix. The specific - // table implementation can use bloom filter and/or other heuristic - // to filter out this table as a whole. 
- virtual bool PrefixMayMatch(const Slice& internal_prefix) = 0; +// For advanced user only +struct BlockBasedTableOptions { + // @flush_block_policy_factory creates the instances of flush block policy. + // which provides a configurable way to determine when to flush a block in + // the block based tables. If not set, table builder will use the default + // block flush policy, which cut blocks by block size (please refer to + // `FlushBlockBySizePolicy`). + std::shared_ptr flush_block_policy_factory; - // Returns a new iterator over the table contents. - // The result of NewIterator() is initially invalid (caller must - // call one of the Seek methods on the iterator before using it). - virtual Iterator* NewIterator(const ReadOptions&) = 0; - - // Given a key, return an approximate byte offset in the file where - // the data for that key begins (or would begin if the key were - // present in the file). The returned value is in terms of file - // bytes, and so includes effects like compression of the underlying data. - // E.g., the approximate offset of the last key in the table will - // be close to the file length. - virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0; - - // Returns true if the block for the specified key is in cache. - // REQUIRES: key is in this table. - virtual bool TEST_KeyInCache(const ReadOptions& options, - const Slice& key) = 0; - - // Set up the table for Compaction. Might change some parameters with - // posix_fadvise - virtual void SetupForCompaction() = 0; - - virtual TableProperties& GetTableProperties() = 0; - - // Calls (*result_handler)(handle_context, ...) repeatedly, starting with - // the entry found after a call to Seek(key), until result_handler returns - // false, where k is the actual internal key for a row found and v as the - // value of the key. didIO is true if I/O is involved in the operation. May - // not make such a call if filter policy says that key is not present. - // - // mark_key_may_exist_handler needs to be called when it is configured to be - // memory only and the key is not found in the block cache, with - // the parameter to be handle_context. + // TODO(kailiu) Temporarily disable this feature by making the default value + // to be false. // - // readOptions is the options for the read - // key is the key to search for - virtual Status Get( - const ReadOptions& readOptions, - const Slice& key, - void* handle_context, - bool (*result_handler)(void* handle_context, const Slice& k, - const Slice& v, bool didIO), - void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0; + // Indicating if we'd put index/filter blocks to the block cache. + // If not specified, each "table reader" object will pre-load index/filter + // block during table initialization. + bool cache_index_and_filter_blocks = false; }; -// A base class for table factories -class TableFactory { - public: - virtual ~TableFactory() {} - - // The type of the table. - // - // The client of this package should switch to a new name whenever - // the table format implementation changes. - // - // Names starting with "rocksdb." are reserved and should not be used - // by any clients of this package. - virtual const char* Name() const = 0; - - // Returns a Table object table that can fetch data from file specified - // in parameter file. It's the caller's responsibility to make sure - // file is in the correct format. 
- // - // GetTableReader() is called in two places: - // (1) TableCache::FindTable() calls the function when table cache miss - // and cache the table object returned. - // (1) SstFileReader (for SST Dump) opens the table and dump the table - // contents using the interator of the table. - // options and soptions are options. options is the general options. - // Multiple configured can be accessed from there, including and not - // limited to block cache and key comparators. - // file is a file handler to handle the file for the table - // file_size is the physical file size of the file - // table_reader is the output table reader - virtual Status GetTableReader( - const Options& options, const EnvOptions& soptions, - unique_ptr && file, uint64_t file_size, - unique_ptr* table_reader) const = 0; +// Create default block based table factory. +extern TableFactory* NewBlockBasedTableFactory( + const BlockBasedTableOptions& table_options = BlockBasedTableOptions()); + +// -- Plain Table +// @user_key_len: plain table has optimization for fix-sized keys, which can be +// specified via user_key_len. Alternatively, you can pass +// `kPlainTableVariableLength` if your keys have variable +// lengths. +// @bloom_bits_per_key: the number of bits used for bloom filer per key. You may +// disable it by passing a zero. +// @hash_table_ratio: the desired utilization of the hash table used for prefix +// hashing. hash_table_ratio = number of prefixes / #buckets +// in the hash table +const uint32_t kPlainTableVariableLength = 0; +extern TableFactory* NewPlainTableFactory( + uint32_t user_key_len = kPlainTableVariableLength, + int bloom_bits_per_key = 10, double hash_table_ratio = 0.75); - // Return a table builder to write to a file for this table type. - // - // It is called in several places: - // (1) When flushing memtable to a level-0 output file, it creates a table - // builder (In DBImpl::WriteLevel0Table(), by calling BuildTable()) - // (2) During compaction, it gets the builder for writing compaction output - // files in DBImpl::OpenCompactionOutputFile(). - // (3) When recovering from transaction logs, it creates a table builder to - // write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery, - // by calling BuildTable()) - // (4) When running Repairer, it creates a table builder to convert logs to - // SST files (In Repairer::ConvertLogToTable() by calling BuildTable()) - // - // options is the general options. Multiple configured can be acceseed from - // there, including and not limited to compression options. - // file is a handle of a writable file. It is the caller's responsibility to - // keep the file open and close the file after closing the table builder. - // compression_type is the compression type to use in this table. - virtual TableBuilder* GetTableBuilder( - const Options& options, WritableFile* file, - CompressionType compression_type) const = 0; -}; } // namespace rocksdb diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index b1b52e87a..1d4b9e344 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -1,23 +1,20 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once #include #include - #include "rocksdb/status.h" namespace rocksdb { +// -- Table Properties // Other than basic table properties, each table may also have the user // collected properties. // The value of the user-collected properties are encoded as raw bytes -- // users have to interprete these values by themselves. -typedef - std::unordered_map - UserCollectedProperties; +typedef std::unordered_map UserCollectedProperties; // TableProperties contains a bunch of read-only properties of its associated // table. @@ -51,9 +48,8 @@ struct TableProperties { // convert this object to a human readable form // @prop_delim: delimiter for each property. - std::string ToString( - const std::string& prop_delim = "; ", - const std::string& kv_delim = "=") const; + std::string ToString(const std::string& prop_delim = "; ", + const std::string& kv_delim = "=") const; }; // table properties' human-readable names in the property block. @@ -77,7 +73,7 @@ extern const std::string kPropertiesBlock; // of callback functions that will be invoked during table building. class TablePropertiesCollector { public: - virtual ~TablePropertiesCollector() { } + virtual ~TablePropertiesCollector() {} // Add() will be called when a new key/value pair is inserted into the table. // @params key the original key that is inserted into the table. diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index e81d99ede..feda28c1a 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -17,7 +17,7 @@ #include "rocksdb/flush_block_policy.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" -#include "rocksdb/table.h" +#include "table/table_builder.h" #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" #include "rocksdb/options.h" diff --git a/table/block_based_table_builder.h b/table/block_based_table_builder.h index 710bfd5a1..0752eb399 100644 --- a/table/block_based_table_builder.h +++ b/table/block_based_table_builder.h @@ -12,7 +12,7 @@ #include "rocksdb/flush_block_policy.h" #include "rocksdb/options.h" #include "rocksdb/status.h" -#include "rocksdb/table.h" +#include "table/table_builder.h" namespace rocksdb { diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index a9cd35a68..3cf064867 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -18,7 +18,7 @@ namespace rocksdb { -Status BlockBasedTableFactory::GetTableReader( +Status BlockBasedTableFactory::NewTableReader( const Options& options, const EnvOptions& soptions, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const { @@ -26,7 +26,7 @@ Status BlockBasedTableFactory::GetTableReader( std::move(file), file_size, table_reader); } -TableBuilder* BlockBasedTableFactory::GetTableBuilder( +TableBuilder* BlockBasedTableFactory::NewTableBuilder( const Options& options, WritableFile* file, CompressionType compression_type) const { auto flush_block_policy_factory = @@ -63,4 +63,9 @@ TableBuilder* BlockBasedTableFactory::GetTableBuilder( return table_builder; } +TableFactory* NewBlockBasedTableFactory( + const BlockBasedTableOptions& table_options) { + return new BlockBasedTableFactory(table_options); +} + } // namespace rocksdb diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index 5a4d1bd6e..bdae45a87 100644 --- 
a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -14,7 +14,7 @@ #include "rocksdb/flush_block_policy.h" #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "table/block_based_table_options.h" +#include "table/table_factory.h" namespace rocksdb { @@ -22,29 +22,23 @@ struct Options; struct EnvOptions; using std::unique_ptr; -class Status; -class RandomAccessFile; -class WritableFile; -class Table; -class TableBuilder; -class BlockBasedTable; class BlockBasedTableBuilder; -class BlockBasedTableFactory: public TableFactory { +class BlockBasedTableFactory : public TableFactory { public: - BlockBasedTableFactory() : BlockBasedTableFactory(BlockBasedTableOptions()) {} - explicit BlockBasedTableFactory(const BlockBasedTableOptions& table_options) + explicit BlockBasedTableFactory( + const BlockBasedTableOptions& table_options = BlockBasedTableOptions()) : table_options_(table_options) {} ~BlockBasedTableFactory() {} const char* Name() const override { return "BlockBasedTable"; } - Status GetTableReader(const Options& options, const EnvOptions& soptions, + Status NewTableReader(const Options& options, const EnvOptions& soptions, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const override; - TableBuilder* GetTableBuilder(const Options& options, WritableFile* file, + TableBuilder* NewTableBuilder(const Options& options, WritableFile* file, CompressionType compression_type) const override; diff --git a/table/block_based_table_options.h b/table/block_based_table_options.h deleted file mode 100644 index f5774e2bf..000000000 --- a/table/block_based_table_options.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#pragma once -#include - -namespace rocksdb { - -class FlushBlockPolicyFactory; - -struct BlockBasedTableOptions { - // @flush_block_policy_factory creates the instances of flush block policy. - // which provides a configurable way to determine when to flush a block in - // the block based tables. If not set, table builder will use the default - // block flush policy, which cut blocks by block size (please refer to - // `FlushBlockBySizePolicy`). - std::shared_ptr flush_block_policy_factory; - - // TODO(kailiu) Temporarily disable this feature by making the default value - // to be false. Also in master branch, this file is non-public so no user - // will be able to change the value of `cache_index_and_filter_blocks`. - // - // Indicating if we'd put index/filter blocks to the block cache. - // If not specified, each "table reader" object will pre-load index/filter - // block during table initialization. 
- bool cache_index_and_filter_blocks = false; -}; - -} // namespace rocksdb diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 8f7470330..da100fee9 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -27,7 +27,6 @@ #include "util/coding.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" -#include "table/block_based_table_options.h" namespace rocksdb { @@ -338,7 +337,7 @@ void BlockBasedTable::SetupForCompaction() { compaction_optimized_ = true; } -TableProperties& BlockBasedTable::GetTableProperties() { +const TableProperties& BlockBasedTable::GetTableProperties() { return rep_->table_properties; } diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 34411f87f..d540f65ad 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -14,8 +14,7 @@ #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/statistics.h" -#include "rocksdb/table_properties.h" -#include "rocksdb/table.h" +#include "table/table_reader.h" #include "util/coding.h" namespace rocksdb { @@ -62,14 +61,12 @@ class BlockBasedTable : public TableReader { // call one of the Seek methods on the iterator before using it). Iterator* NewIterator(const ReadOptions&) override; - Status Get( - const ReadOptions& readOptions, - const Slice& key, - void* handle_context, - bool (*result_handler)(void* handle_context, const Slice& k, - const Slice& v, bool didIO), - void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) - override; + Status Get(const ReadOptions& readOptions, const Slice& key, + void* handle_context, + bool (*result_handler)(void* handle_context, const Slice& k, + const Slice& v, bool didIO), + void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) + override; // Given a key, return an approximate byte offset in the file where // the data for that key begins (or would begin if the key were @@ -81,13 +78,13 @@ class BlockBasedTable : public TableReader { // Returns true if the block for the specified key is in cache. // REQUIRES: key is in this table. - bool TEST_KeyInCache(const ReadOptions& options, const Slice& key) override; + bool TEST_KeyInCache(const ReadOptions& options, const Slice& key); // Set up the table for Compaction. Might change some parameters with // posix_fadvise void SetupForCompaction() override; - TableProperties& GetTableProperties() override; + const TableProperties& GetTableProperties() override; ~BlockBasedTable(); diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 5d2d94175..294b96d74 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -7,7 +7,7 @@ #include -#include "rocksdb/table_properties.h" +#include "rocksdb/table.h" #include "table/block.h" #include "table/format.h" #include "util/coding.h" diff --git a/table/plain_table_builder.h b/table/plain_table_builder.h index f4be46828..1793d1d72 100644 --- a/table/plain_table_builder.h +++ b/table/plain_table_builder.h @@ -9,7 +9,7 @@ #include #include "rocksdb/options.h" #include "rocksdb/status.h" -#include "rocksdb/table.h" +#include "table/table_builder.h" #include "rocksdb/table_properties.h" namespace rocksdb { diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc index bf941a62d..45ae71c64 100644 --- a/table/plain_table_factory.cc +++ b/table/plain_table_factory.cc @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
See the AUTHORS file for names of contributors. -#include "rocksdb/plain_table_factory.h" +#include "table/plain_table_factory.h" #include #include @@ -12,19 +12,26 @@ namespace rocksdb { -Status PlainTableFactory::GetTableReader(const Options& options, +Status PlainTableFactory::NewTableReader(const Options& options, const EnvOptions& soptions, - unique_ptr && file, + unique_ptr&& file, uint64_t file_size, - unique_ptr* table) - const { + unique_ptr* table) const { return PlainTableReader::Open(options, soptions, std::move(file), file_size, - table, bloom_num_bits_, hash_table_ratio_); + table, bloom_bits_per_key_, hash_table_ratio_); } -TableBuilder* PlainTableFactory::GetTableBuilder( +TableBuilder* PlainTableFactory::NewTableBuilder( const Options& options, WritableFile* file, CompressionType compression_type) const { return new PlainTableBuilder(options, file, user_key_len_); } + +extern TableFactory* NewPlainTableFactory(uint32_t user_key_len, + int bloom_bits_per_key, + double hash_table_ratio) { + return new PlainTableFactory(user_key_len, bloom_bits_per_key, + hash_table_ratio); +} + } // namespace rocksdb diff --git a/include/rocksdb/plain_table_factory.h b/table/plain_table_factory.h similarity index 62% rename from include/rocksdb/plain_table_factory.h rename to table/plain_table_factory.h index 5cf59d23a..55680a3ec 100644 --- a/include/rocksdb/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -8,6 +8,7 @@ #include "rocksdb/options.h" #include "rocksdb/table.h" +#include "table/table_factory.h" namespace rocksdb { @@ -37,40 +38,35 @@ class TableBuilder; // | | // | ...... | // +-----------------+--------------------------+ -// If user_key_length = kVariableLength, it means the key is variable length, -// there will be an extra field for key size encoded before every key. -class PlainTableFactory: public TableFactory { -public: - ~PlainTableFactory() { - } +// If user_key_length = kPlainTableVariableLength, it means the key is variable +// length, there will be an extra field for key size encoded before every key. +class PlainTableFactory : public TableFactory { + public: + ~PlainTableFactory() {} // user_key_size is the length of the user key. If it is set to be - // kVariableLength, then it means variable length. Otherwise, all the - // keys need to have the fix length of this value. bloom_num_bits is + // kPlainTableVariableLength, then it means variable length. Otherwise, all + // the keys need to have the fix length of this value. bloom_bits_per_key is // number of bits used for bloom filer per key. hash_table_ratio is // the desired utilization of the hash table used for prefix hashing. 
// hash_table_ratio = number of prefixes / #buckets in the hash table - explicit PlainTableFactory(uint32_t user_key_len = kVariableLength, - int bloom_num_bits = 0, - double hash_table_ratio = 0.75) : - user_key_len_(user_key_len), bloom_num_bits_(bloom_num_bits), - hash_table_ratio_(hash_table_ratio) { - } - const char* Name() const override { - return "PlainTable"; - } - Status GetTableReader(const Options& options, const EnvOptions& soptions, - unique_ptr && file, - uint64_t file_size, + explicit PlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength, + int bloom_bits_per_key = 0, + double hash_table_ratio = 0.75) + : user_key_len_(user_key_len), + bloom_bits_per_key_(bloom_bits_per_key), + hash_table_ratio_(hash_table_ratio) {} + const char* Name() const override { return "PlainTable"; } + Status NewTableReader(const Options& options, const EnvOptions& soptions, + unique_ptr&& file, uint64_t file_size, unique_ptr* table) const override; - TableBuilder* GetTableBuilder(const Options& options, WritableFile* file, - CompressionType compression_type) const - override; + TableBuilder* NewTableBuilder(const Options& options, WritableFile* file, + CompressionType compression_type) + const override; - static const uint32_t kVariableLength = 0; -private: + private: uint32_t user_key_len_; - int bloom_num_bits_; + int bloom_bits_per_key_; double hash_table_ratio_; }; diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index ab2e90c25..5d769eea2 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -15,13 +15,13 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/options.h" #include "rocksdb/statistics.h" -#include "rocksdb/plain_table_factory.h" #include "table/block.h" #include "table/filter_block.h" #include "table/format.h" #include "table/meta_blocks.h" #include "table/two_level_iterator.h" +#include "table/plain_table_factory.h" #include "util/coding.h" #include "util/dynamic_bloom.h" @@ -103,10 +103,10 @@ PlainTableReader::~PlainTableReader() { Status PlainTableReader::Open(const Options& options, const EnvOptions& soptions, - unique_ptr && file, + unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader, - const int bloom_num_bits, + const int bloom_bits_per_key, double hash_table_ratio) { assert(options.allow_mmap_reads); @@ -122,8 +122,9 @@ Status PlainTableReader::Open(const Options& options, return s; } - std::unique_ptr new_reader(new PlainTableReader( - soptions, file_size, bloom_num_bits, hash_table_ratio, table_properties)); + std::unique_ptr new_reader( + new PlainTableReader(soptions, file_size, bloom_bits_per_key, + hash_table_ratio, table_properties)); new_reader->file_ = std::move(file); new_reader->options_ = options; @@ -556,11 +557,6 @@ Status PlainTableReader::Get( return Status::OK(); } -bool PlainTableReader::TEST_KeyInCache(const ReadOptions& options, - const Slice& key) { - return false; -} - uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& key) { return 0; } diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index 4e866219e..d223a13d5 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -7,11 +7,14 @@ #include #include #include + #include "rocksdb/env.h" #include "rocksdb/iterator.h" -#include "rocksdb/table.h" #include "rocksdb/slice_transform.h" -#include "rocksdb/plain_table_factory.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/table_reader.h" +#include "table/plain_table_factory.h" namespace rocksdb { @@ 
-27,6 +30,7 @@ class DynamicBloom; using std::unique_ptr; using std::unordered_map; +extern const uint32_t kPlainTableVariableLength; // Based on following output file format shown in plain_table_factory.h // When opening the output file, IndexedTableReader creates a hash table @@ -40,8 +44,8 @@ class PlainTableReader: public TableReader { public: static Status Open(const Options& options, const EnvOptions& soptions, unique_ptr&& file, uint64_t file_size, - unique_ptr* table, const int bloom_num_bits, - double hash_table_ratio); + unique_ptr* table, + const int bloom_bits_per_key, double hash_table_ratio); bool PrefixMayMatch(const Slice& internal_prefix); @@ -54,16 +58,12 @@ class PlainTableReader: public TableReader { uint64_t ApproximateOffsetOf(const Slice& key); - bool TEST_KeyInCache(const ReadOptions& options, const Slice& key); - void SetupForCompaction(); - TableProperties& GetTableProperties() { - return table_properties_; - } + const TableProperties& GetTableProperties() { return table_properties_; } PlainTableReader(const EnvOptions& storage_options, uint64_t file_size, - int bloom_num_bits, double hash_table_ratio, + int bloom_bits_per_key, double hash_table_ratio, const TableProperties& table_properties); ~PlainTableReader(); @@ -104,7 +104,7 @@ class PlainTableReader: public TableReader { static const size_t kIndexIntervalForSamePrefixKeys = 16; bool IsFixedLength() const { - return user_key_len_ != PlainTableFactory::kVariableLength; + return user_key_len_ != kPlainTableVariableLength; } size_t GetFixedInternalKeyLength() const { diff --git a/table/table_builder.h b/table/table_builder.h new file mode 100644 index 000000000..ee32cff86 --- /dev/null +++ b/table/table_builder.h @@ -0,0 +1,55 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +namespace rocksdb { + +class Slice; +class Status; + +// TableBuilder provides the interface used to build a Table +// (an immutable and sorted map from keys to values). +// +// Multiple threads can invoke const methods on a TableBuilder without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same TableBuilder must use +// external synchronization. +class TableBuilder { + public: + // REQUIRES: Either Finish() or Abandon() has been called. + virtual ~TableBuilder() {} + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + virtual void Add(const Slice& key, const Slice& value) = 0; + + // Return non-ok iff some error has been detected. + virtual Status status() const = 0; + + // Finish building the table. + // REQUIRES: Finish(), Abandon() have not been called + virtual Status Finish() = 0; + + // Indicate that the contents of this builder should be abandoned. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. 
+  // REQUIRES: Finish(), Abandon() have not been called
+  virtual void Abandon() = 0;
+
+  // Number of calls to Add() so far.
+  virtual uint64_t NumEntries() const = 0;
+
+  // Size of the file generated so far. If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  virtual uint64_t FileSize() const = 0;
+};
+
+} // namespace rocksdb
diff --git a/table/table_factory.h b/table/table_factory.h
new file mode 100644
index 000000000..d4b222657
--- /dev/null
+++ b/table/table_factory.h
@@ -0,0 +1,82 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+using std::unique_ptr;
+
+class RandomAccessFile;
+class TableBuilder;
+class TableReader;
+class WritableFile;
+struct EnvOptions;
+struct Options;
+
+// A base class for table factories.
+class TableFactory {
+ public:
+  virtual ~TableFactory() {}
+
+  // The type of the table.
+  //
+  // The client of this package should switch to a new name whenever
+  // the table format implementation changes.
+  //
+  // Names starting with "rocksdb." are reserved and should not be used
+  // by any clients of this package.
+  virtual const char* Name() const = 0;
+
+  // Returns a table reader object that can fetch data from the file
+  // specified in parameter file. It is the caller's responsibility to
+  // make sure the file is in the correct format.
+  //
+  // NewTableReader() is called in two places:
+  // (1) TableCache::FindTable() calls the function on a table cache miss
+  //     and caches the table object returned.
+  // (2) SstFileReader (for SST Dump) opens the table and dumps its
+  //     contents using the iterator of the table.
+  // options is the general options; multiple configuration parameters can
+  // be accessed from there, including but not limited to the block cache
+  // and key comparators.
+  // file is a handle to the file backing the table.
+  // file_size is the physical size of the file.
+  // table_reader is the output table reader.
+  virtual Status NewTableReader(
+      const Options& options, const EnvOptions& soptions,
+      unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+      unique_ptr<TableReader>* table_reader) const = 0;
+
+  // Return a table builder to write to a file for this table type.
+  //
+  // It is called in several places:
+  // (1) When flushing a memtable to a level-0 output file, it creates a
+  //     table builder (in DBImpl::WriteLevel0Table(), by calling
+  //     BuildTable())
+  // (2) During compaction, it gets the builder for writing compaction
+  //     output files in DBImpl::OpenCompactionOutputFile().
+  // (3) When recovering from transaction logs, it creates a table builder
+  //     to write to a level-0 output file (in
+  //     DBImpl::WriteLevel0TableForRecovery, by calling BuildTable())
+  // (4) When running Repairer, it creates a table builder to convert logs
+  //     to SST files (in Repairer::ConvertLogToTable(), by calling
+  //     BuildTable())
+  //
+  // options is the general options; multiple configuration parameters can
+  // be accessed from there, including but not limited to compression
+  // options.
+  // file is a handle of a writable file. It is the caller's responsibility
+  // to keep the file open and to close the file after closing the table
+  // builder.
+  // compression_type is the compression type to use in this table.
+  virtual TableBuilder* NewTableBuilder(
+      const Options& options, WritableFile* file,
+      CompressionType compression_type) const = 0;
+};
+
+} // namespace rocksdb
diff --git a/table/table_reader.h b/table/table_reader.h
new file mode 100644
index 000000000..983c998e7
--- /dev/null
+++ b/table/table_reader.h
@@ -0,0 +1,70 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+namespace rocksdb {
+
+class Iterator;
+class Slice;
+struct ReadOptions;
+struct TableProperties;
+
+// A Table is a sorted map from strings to strings. Tables are
+// immutable and persistent. A Table may be safely accessed from
+// multiple threads without external synchronization.
+class TableReader {
+ public:
+  virtual ~TableReader() {}
+
+  // Determine whether there is a chance that the current table file
+  // contains a key starting with internal_prefix. The specific table
+  // implementation can use a bloom filter and/or other heuristics to
+  // filter out this table as a whole.
+  virtual bool PrefixMayMatch(const Slice& internal_prefix) = 0;
+
+  // Returns a new iterator over the table contents.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  virtual Iterator* NewIterator(const ReadOptions&) = 0;
+
+  // Given a key, return an approximate byte offset in the file where
+  // the data for that key begins (or would begin if the key were
+  // present in the file). The returned value is in terms of file
+  // bytes, and so includes effects like compression of the underlying data.
+  // E.g., the approximate offset of the last key in the table will
+  // be close to the file length.
+  virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0;
+
+  // Set up the table for compaction. Might change some parameters with
+  // posix_fadvise.
+  virtual void SetupForCompaction() = 0;
+
+  virtual const TableProperties& GetTableProperties() = 0;
+
+  // Calls (*result_handler)(handle_context, ...) repeatedly, starting with
+  // the entry found after a call to Seek(key), until result_handler returns
+  // false, where k is the internal key of a found row and v is its value.
+  // didIO is true if I/O was involved in the operation. The call may be
+  // skipped entirely if the filter policy says the key is not present.
+  //
+  // mark_key_may_exist_handler needs to be called with handle_context as
+  // its parameter when the read is configured to be memory-only and the
+  // key is not found in the block cache.
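+  //
+  // An illustrative result_handler, not part of this patch (the name
+  // SaveFirst and the std::string context are assumptions), showing how a
+  // caller might capture the first matching row and stop the scan:
+  //
+  //   static bool SaveFirst(void* handle_context, const Slice& k,
+  //                         const Slice& v, bool didIO) {
+  //     static_cast<std::string*>(handle_context)->assign(v.data(),
+  //                                                       v.size());
+  //     return false;  // stop after the first match
+  //   }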
+ // + // readOptions is the options for the read + // key is the key to search for + virtual Status Get( + const ReadOptions& readOptions, const Slice& key, void* handle_context, + bool (*result_handler)(void* handle_context, const Slice& k, + const Slice& v, bool didIO), + void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0; +}; + +} // namespace rocksdb diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index a491d168f..88436c1f3 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -12,7 +12,7 @@ #include "db/dbformat.h" #include "port/atomic_pointer.h" #include "table/block_based_table_factory.h" -#include "rocksdb/plain_table_factory.h" +#include "table/plain_table_factory.h" #include "util/histogram.h" #include "util/testharness.h" #include "util/testutil.h" @@ -71,7 +71,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, Status s; if (!through_db) { env->NewWritableFile(file_name, &file, env_options); - tb = opts.table_factory->GetTableBuilder(opts, file.get(), + tb = opts.table_factory->NewTableBuilder(opts, file.get(), CompressionType::kNoCompression); } else { s = DB::Open(opts, dbname, &db); @@ -102,7 +102,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, Status s = env->NewRandomAccessFile(file_name, &raf, env_options); uint64_t file_size; env->GetFileSize(file_name, &file_size); - s = opts.table_factory->GetTableReader(opts, env_options, std::move(raf), + s = opts.table_factory->NewTableReader(opts, env_options, std::move(raf), file_size, &table_reader); } diff --git a/table/table_test.cc b/table/table_test.cc index d165fd2f2..39f341131 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -20,19 +20,18 @@ #include "rocksdb/cache.h" #include "rocksdb/db.h" -#include "rocksdb/plain_table_factory.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/slice_transform.h" #include "rocksdb/memtablerep.h" -#include "table/meta_blocks.h" -#include "rocksdb/plain_table_factory.h" +#include "table/block.h" #include "table/block_based_table_builder.h" #include "table/block_based_table_factory.h" #include "table/block_based_table_reader.h" #include "table/block_builder.h" -#include "table/block.h" #include "table/format.h" +#include "table/meta_blocks.h" +#include "table/plain_table_factory.h" #include "util/random.h" #include "util/testharness.h" @@ -303,9 +302,8 @@ class TableConstructor: public Constructor { Reset(); sink_.reset(new StringSink()); unique_ptr builder; - builder.reset( - options.table_factory->GetTableBuilder(options, sink_.get(), - options.compression)); + builder.reset(options.table_factory->NewTableBuilder(options, sink_.get(), + options.compression)); for (KVMap::const_iterator it = data.begin(); it != data.end(); @@ -329,7 +327,7 @@ class TableConstructor: public Constructor { uniq_id_ = cur_uniq_id_++; source_.reset(new StringSource(sink_->contents(), uniq_id_, options.allow_mmap_reads)); - return options.table_factory->GetTableReader( + return options.table_factory->NewTableReader( options, soptions, std::move(source_), sink_->contents().size(), &table_reader_); } @@ -351,10 +349,9 @@ class TableConstructor: public Constructor { source_.reset( new StringSource(sink_->contents(), uniq_id_, options.allow_mmap_reads)); - return options.table_factory->GetTableReader(options, soptions, - std::move(source_), - sink_->contents().size(), - &table_reader_); + return options.table_factory->NewTableReader( + options, soptions, 
std::move(source_), sink_->contents().size(), + &table_reader_); } virtual TableReader* table_reader() { @@ -1210,8 +1207,9 @@ TEST(BlockBasedTableTest, BlockCacheLeak) { ASSERT_OK(iter->status()); ASSERT_OK(c.Reopen(opt)); + auto table_reader = dynamic_cast(c.table_reader()); for (const std::string& key : keys) { - ASSERT_TRUE(c.table_reader()->TEST_KeyInCache(ReadOptions(), key)); + ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key)); } } @@ -1220,7 +1218,7 @@ TEST(PlainTableTest, BasicPlainTableProperties) { PlainTableFactory factory(8, 8, 0); StringSink sink; std::unique_ptr builder( - factory.GetTableBuilder(Options(), &sink, kNoCompression)); + factory.NewTableBuilder(Options(), &sink, kNoCompression)); for (char c = 'a'; c <= 'z'; ++c) { std::string key(16, c); diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc index 1c6b9cfc1..7eb339659 100644 --- a/tools/sst_dump.cc +++ b/tools/sst_dump.cc @@ -80,9 +80,8 @@ Status SstFileReader::NewTableReader(const std::string& file_path) { uint64_t file_size; table_options_.env->GetFileSize(file_path, &file_size); unique_ptr table_factory; - s = table_options_.table_factory->GetTableReader(table_options_, soptions_, - std::move(file_), file_size, - &table_reader_); + s = table_options_.table_factory->NewTableReader( + table_options_, soptions_, std::move(file_), file_size, &table_reader_); return s; } diff --git a/util/options.cc b/util/options.cc index 6f39a2464..b2b4f8688 100644 --- a/util/options.cc +++ b/util/options.cc @@ -16,10 +16,11 @@ #include "rocksdb/comparator.h" #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" -#include "rocksdb/merge_operator.h" #include "rocksdb/memtablerep.h" +#include "rocksdb/merge_operator.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "table/block_based_table_factory.h" From d169b676802a8beac2ab6ace71c0e7f0fc54f77d Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Mon, 27 Jan 2014 13:53:22 -0800 Subject: [PATCH 67/70] [Performance Branch] PlainTable to encode rows with seqID 0, value type using 1 internal byte. Summary: In PlainTable, use one single byte to represent 8 bytes of internal bytes, if seqID = 0 and it is value type (which should be common for bottom most files). It is to save 7 bytes for uncompressed cases. 
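A minimal sketch of the trailer encoding this summary describes, not the patch's actual code: the helper names and the use of the type byte's high bit as the one-byte marker are assumptions, motivated by the dbformat.h change in this patch, which reserves the value type's highest bit so SST formats can do more flexible encoding.

#include <cstdint>
#include <string>

enum ValueType : unsigned char { kTypeValue = 0x1 };

// Little-endian fixed 64-bit write, standing in for RocksDB's PutFixed64.
void PutFixed64(std::string* dst, uint64_t v) {
  for (int i = 0; i < 8; ++i) {
    dst->push_back(static_cast<char>((v >> (8 * i)) & 0xff));
  }
}

// Append the internal-key trailer: a single marker byte when sequence == 0
// and the entry is a plain value, otherwise the usual 8-byte
// (sequence << 8 | type) encoding. The compact case saves 7 bytes per row.
void AppendTrailer(std::string* out, uint64_t sequence, ValueType type) {
  if (sequence == 0 && type == kTypeValue) {
    out->push_back(static_cast<char>(0x80 | type));  // assumed marker bit
  } else {
    PutFixed64(out, (sequence << 8) | type);
  }
}

A reader would do the inverse: a trailer byte with the high bit set expands back to sequence number 0 and kTypeValue before the key is handed to the rest of the system.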
Test Plan: make all check Reviewers: haobo, dhruba, kailiu Reviewed By: haobo CC: igor, leveldb Differential Revision: https://reviews.facebook.net/D15489 --- db/builder.cc | 36 +++--- db/builder.h | 16 ++- db/db_impl.cc | 25 ++--- db/db_iter.cc | 2 +- db/db_test.cc | 1 + db/dbformat.cc | 24 +++- db/dbformat.h | 11 +- db/repair.cc | 8 +- db/simple_table_db_test.cc | 37 ++++--- db/table_cache.cc | 36 +++--- db/table_cache.h | 29 ++--- db/table_properties_collector_test.cc | 44 ++++---- db/version_set.cc | 154 ++++++++++++-------------- db/write_batch_test.cc | 2 +- include/rocksdb/options.h | 1 + include/rocksdb/table.h | 2 - table/block_based_table_builder.cc | 35 +++--- table/block_based_table_builder.h | 1 + table/block_based_table_factory.cc | 16 +-- table/block_based_table_factory.h | 7 +- table/block_based_table_reader.cc | 49 ++++---- table/block_based_table_reader.h | 15 ++- table/block_builder.cc | 6 +- table/block_builder.h | 2 +- table/block_test.cc | 5 +- table/filter_block.cc | 11 +- table/filter_block.h | 3 +- table/filter_block_test.cc | 6 +- table/plain_table_builder.cc | 28 +++-- table/plain_table_factory.cc | 11 +- table/plain_table_factory.h | 10 +- table/plain_table_reader.cc | 117 +++++++++++++------ table/plain_table_reader.h | 27 +++-- table/table_factory.h | 5 +- table/table_reader.h | 3 +- table/table_reader_bench.cc | 6 +- table/table_test.cc | 114 +++++++++++++------ table/two_level_iterator.cc | 53 +++++---- table/two_level_iterator.h | 14 +-- tools/sst_dump.cc | 4 +- util/testutil.h | 25 +++++ 41 files changed, 592 insertions(+), 409 deletions(-) diff --git a/db/builder.cc b/db/builder.cc index 3b51bf88e..96fb29eef 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -26,20 +26,18 @@ namespace rocksdb { class TableFactory; -TableBuilder* NewTableBuilder(const Options& options, WritableFile* file, +TableBuilder* NewTableBuilder(const Options& options, + const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) { - return options.table_factory->NewTableBuilder(options, file, - compression_type); + return options.table_factory->NewTableBuilder(options, internal_comparator, + file, compression_type); } -Status BuildTable(const std::string& dbname, - Env* env, - const Options& options, - const EnvOptions& soptions, - TableCache* table_cache, - Iterator* iter, - FileMetaData* meta, - const Comparator* user_comparator, +Status BuildTable(const std::string& dbname, Env* env, const Options& options, + const EnvOptions& soptions, TableCache* table_cache, + Iterator* iter, FileMetaData* meta, + const InternalKeyComparator& internal_comparator, const SequenceNumber newest_snapshot, const SequenceNumber earliest_seqno_in_memtable, const CompressionType compression) { @@ -64,7 +62,8 @@ Status BuildTable(const std::string& dbname, return s; } - TableBuilder* builder = NewTableBuilder(options, file.get(), compression); + TableBuilder* builder = + NewTableBuilder(options, internal_comparator, file.get(), compression); // the first key is the smallest key Slice key = iter->key(); @@ -72,8 +71,8 @@ Status BuildTable(const std::string& dbname, meta->smallest_seqno = GetInternalKeySeqno(key); meta->largest_seqno = meta->smallest_seqno; - MergeHelper merge(user_comparator, options.merge_operator.get(), - options.info_log.get(), + MergeHelper merge(internal_comparator.user_comparator(), + options.merge_operator.get(), options.info_log.get(), true /* internal key corruption is not ok */); if (purge) { @@ -102,8 +101,8 @@ Status 
BuildTable(const std::string& dbname, // If the key is the same as the previous key (and it is not the // first key), then we skip it, since it is an older version. // Otherwise we output the key and mark it as the "new" previous key. - if (!is_first_key && !user_comparator->Compare(prev_ikey.user_key, - this_ikey.user_key)) { + if (!is_first_key && !internal_comparator.user_comparator()->Compare( + prev_ikey.user_key, this_ikey.user_key)) { // seqno within the same key are in decreasing order assert(this_ikey.sequence < prev_ikey.sequence); } else { @@ -201,9 +200,8 @@ Status BuildTable(const std::string& dbname, if (s.ok()) { // Verify that the table is usable - Iterator* it = table_cache->NewIterator(ReadOptions(), - soptions, - *meta); + Iterator* it = table_cache->NewIterator(ReadOptions(), soptions, + internal_comparator, *meta); s = it->status(); delete it; } diff --git a/db/builder.h b/db/builder.h index 189bfe6fe..630162968 100644 --- a/db/builder.h +++ b/db/builder.h @@ -24,22 +24,20 @@ class VersionEdit; class TableBuilder; class WritableFile; -extern TableBuilder* NewTableBuilder(const Options& options, WritableFile* file, - CompressionType compression_type); +extern TableBuilder* NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type); // Build a Table file from the contents of *iter. The generated file // will be named according to meta->number. On success, the rest of // *meta will be filled with metadata about the generated table. // If no data is present in *iter, meta->file_size will be set to // zero, and no Table file will be produced. -extern Status BuildTable(const std::string& dbname, - Env* env, - const Options& options, - const EnvOptions& soptions, - TableCache* table_cache, - Iterator* iter, +extern Status BuildTable(const std::string& dbname, Env* env, + const Options& options, const EnvOptions& soptions, + TableCache* table_cache, Iterator* iter, FileMetaData* meta, - const Comparator* user_comparator, + const InternalKeyComparator& internal_comparator, const SequenceNumber newest_snapshot, const SequenceNumber earliest_seqno_in_memtable, const CompressionType compression); diff --git a/db/db_impl.cc b/db/db_impl.cc index 9a7d0c178..e7351d09e 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -127,7 +127,6 @@ Options SanitizeOptions(const std::string& dbname, const InternalFilterPolicy* ipolicy, const Options& src) { Options result = src; - result.comparator = icmp; result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr; // result.max_open_files means an "infinite" open files. 
if (result.max_open_files != -1) { @@ -1107,9 +1106,8 @@ Status DBImpl::WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit) { { mutex_.Unlock(); s = BuildTable(dbname_, env_, options_, storage_options_, - table_cache_.get(), iter, &meta, - user_comparator(), newest_snapshot, - earliest_seqno_in_memtable, + table_cache_.get(), iter, &meta, internal_comparator_, + newest_snapshot, earliest_seqno_in_memtable, GetCompressionFlush(options_)); LogFlush(options_.info_log); mutex_.Lock(); @@ -1173,9 +1171,9 @@ Status DBImpl::WriteLevel0Table(autovector& mems, VersionEdit* edit, (unsigned long)meta.number); s = BuildTable(dbname_, env_, options_, storage_options_, - table_cache_.get(), iter, &meta, - user_comparator(), newest_snapshot, - earliest_seqno_in_memtable, GetCompressionFlush(options_)); + table_cache_.get(), iter, &meta, internal_comparator_, + newest_snapshot, earliest_seqno_in_memtable, + GetCompressionFlush(options_)); LogFlush(options_.info_log); delete iter; Log(options_.info_log, "Level-0 flush table #%lu: %lu bytes %s", @@ -2137,8 +2135,9 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { options_, compact->compaction->output_level(), compact->compaction->enable_compression()); - compact->builder.reset( - NewTableBuilder(options_, compact->outfile.get(), compression_type)); + compact->builder.reset(NewTableBuilder(options_, internal_comparator_, + compact->outfile.get(), + compression_type)); } LogFlush(options_.info_log); return s; @@ -2186,9 +2185,8 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, if (s.ok() && current_entries > 0) { // Verify that the table is usable FileMetaData meta(output_number, current_bytes); - Iterator* iter = table_cache_->NewIterator(ReadOptions(), - storage_options_, - meta); + Iterator* iter = table_cache_->NewIterator(ReadOptions(), storage_options_, + internal_comparator_, meta); s = iter->status(); delete iter; if (s.ok()) { @@ -2522,8 +2520,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, // If this is the bottommost level (no files in lower levels) // and the earliest snapshot is larger than this seqno // then we can squash the seqno to zero. - if (options_.compaction_style == kCompactionStyleLevel && - bottommost_level && ikey.sequence < earliest_snapshot && + if (bottommost_level && ikey.sequence < earliest_snapshot && ikey.type != kTypeMerge) { assert(ikey.type != kTypeDeletion); // make a copy because updating in place would cause problems diff --git a/db/db_iter.cc b/db/db_iter.cc index 1b5ae5688..b8d9038a1 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -235,7 +235,7 @@ void DBIter::FindNextUserEntryInternal(bool skipping) { valid_ = true; MergeValuesNewToOld(); // Go to a different state machine return; - case kTypeLogData: + default: assert(false); break; } diff --git a/db/db_test.cc b/db/db_test.cc index b1fb3f9de..0c4e21c85 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -11,6 +11,7 @@ #include #include +#include "db/dbformat.h" #include "db/db_impl.h" #include "db/filename.h" #include "db/version_set.h" diff --git a/db/dbformat.cc b/db/dbformat.cc index 3d7e61010..43560bc83 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -6,9 +6,9 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "db/dbformat.h" #include -#include "db/dbformat.h" #include "port/port.h" #include "util/coding.h" #include "util/perf_context_imp.h" @@ -72,6 +72,28 @@ int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const { return r; } +int InternalKeyComparator::Compare(const ParsedInternalKey& a, + const ParsedInternalKey& b) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + // decreasing type (though sequence# should be enough to disambiguate) + int r = user_comparator_->Compare(a.user_key, b.user_key); + BumpPerfCount(&perf_context.user_key_comparison_count); + if (r == 0) { + if (a.sequence > b.sequence) { + r = -1; + } else if (a.sequence < b.sequence) { + r = +1; + } else if (a.type > b.type) { + r = -1; + } else if (a.type < b.type) { + r = +1; + } + } + return r; +} + void InternalKeyComparator::FindShortestSeparator( std::string* start, const Slice& limit) const { diff --git a/db/dbformat.h b/db/dbformat.h index 64a2c9f05..e3dbe0ba3 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -25,12 +25,16 @@ class InternalKey; // Value types encoded as the last component of internal keys. // DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk // data structures. -enum ValueType { +// The highest bit of the value type needs to be reserved to SST tables +// for them to do more flexible encoding. +enum ValueType : unsigned char { kTypeDeletion = 0x0, kTypeValue = 0x1, kTypeMerge = 0x2, - kTypeLogData = 0x3 + kTypeLogData = 0x3, + kMaxValue = 0x7F }; + // kValueTypeForSeek defines the ValueType that should be passed when // constructing a ParsedInternalKey object for seeking to a particular // sequence number (since we sort sequence numbers in decreasing order @@ -96,6 +100,7 @@ class InternalKeyComparator : public Comparator { name_("rocksdb.InternalKeyComparator:" + std::string(user_comparator_->Name())) { } + virtual ~InternalKeyComparator() {} virtual const char* Name() const; virtual int Compare(const Slice& a, const Slice& b) const; @@ -107,6 +112,7 @@ class InternalKeyComparator : public Comparator { const Comparator* user_comparator() const { return user_comparator_; } int Compare(const InternalKey& a, const InternalKey& b) const; + int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const; }; // Filter policy wrapper that converts from internal keys to user keys @@ -163,6 +169,7 @@ inline bool ParseInternalKey(const Slice& internal_key, unsigned char c = num & 0xff; result->sequence = num >> 8; result->type = static_cast(c); + assert(result->type <= ValueType::kMaxValue); result->user_key = Slice(internal_key.data(), n - 8); return (c <= static_cast(kValueTypeForSeek)); } diff --git a/db/repair.cc b/db/repair.cc index f72bca6b7..5a6cba44d 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -222,10 +222,8 @@ class Repairer { FileMetaData meta; meta.number = next_file_number_++; Iterator* iter = mem->NewIterator(); - status = BuildTable(dbname_, env_, options_, storage_options_, - table_cache_, iter, &meta, - icmp_.user_comparator(), 0, 0, - kNoCompression); + status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_, + iter, &meta, icmp_, 0, 0, kNoCompression); delete iter; delete mem->Unref(); mem = nullptr; @@ -267,7 +265,7 @@ class Repairer { if (status.ok()) { FileMetaData dummy_meta(t->meta.number, t->meta.file_size); Iterator* iter = table_cache_->NewIterator( - ReadOptions(), storage_options_, dummy_meta); + ReadOptions(), storage_options_, 
icmp_, dummy_meta); bool empty = true; ParsedInternalKey parsed; t->min_sequence = 0; diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc index 845165ec2..3d1420c0c 100644 --- a/db/simple_table_db_test.cc +++ b/db/simple_table_db_test.cc @@ -87,10 +87,10 @@ public: Iterator* NewIterator(const ReadOptions&) override; - Status Get( - const ReadOptions&, const Slice& key, void* arg, - bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool), - void (*mark_key_may_exist)(void*) = nullptr) override; + Status Get(const ReadOptions&, const Slice& key, void* arg, + bool (*handle_result)(void* arg, const ParsedInternalKey& k, + const Slice& v, bool), + void (*mark_key_may_exist)(void*) = nullptr) override; uint64_t ApproximateOffsetOf(const Slice& key) override; @@ -245,7 +245,8 @@ Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) { return s; } - int compare_result = rep_->options.comparator->Compare(tmp_slice, target); + InternalKeyComparator ikc(rep_->options.comparator); + int compare_result = ikc.Compare(tmp_slice, target); if (compare_result < 0) { if (left == right) { @@ -280,14 +281,20 @@ Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) { return s; } -Status SimpleTableReader::Get( - const ReadOptions& options, const Slice& k, void* arg, - bool (*saver)(void*, const Slice&, const Slice&, bool), - void (*mark_key_may_exist)(void*)) { +Status SimpleTableReader::Get(const ReadOptions& options, const Slice& k, + void* arg, + bool (*saver)(void*, const ParsedInternalKey&, + const Slice&, bool), + void (*mark_key_may_exist)(void*)) { Status s; SimpleTableIterator* iter = new SimpleTableIterator(this); for (iter->Seek(k); iter->Valid(); iter->Next()) { - if (!(*saver)(arg, iter->key(), iter->value(), true)) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(iter->key(), &parsed_key)) { + return Status::Corruption(Slice()); + } + + if (!(*saver)(arg, parsed_key, iter->value(), true)) { break; } } @@ -537,15 +544,19 @@ public: return "SimpleTable"; } Status NewTableReader(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_key, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const; - TableBuilder* NewTableBuilder(const Options& options, WritableFile* file, + TableBuilder* NewTableBuilder(const Options& options, + const InternalKeyComparator& internal_key, + WritableFile* file, CompressionType compression_type) const; }; Status SimpleTableFactory::NewTableReader( const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_key, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const { @@ -554,8 +565,8 @@ Status SimpleTableFactory::NewTableReader( } TableBuilder* SimpleTableFactory::NewTableBuilder( - const Options& options, WritableFile* file, - CompressionType compression_type) const { + const Options& options, const InternalKeyComparator& internal_key, + WritableFile* file, CompressionType compression_type) const { return new SimpleTableBuilder(options, file, compression_type); } diff --git a/db/table_cache.cc b/db/table_cache.cc index 6e48e1a90..591933cef 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -60,6 +60,7 @@ void TableCache::ReleaseHandle(Cache::Handle* handle) { } Status TableCache::FindTable(const EnvOptions& toptions, + const InternalKeyComparator& internal_comparator, uint64_t file_number, uint64_t file_size, Cache::Handle** handle, bool* table_io, const bool no_io) { @@ -84,7 +85,8 @@ 
Status TableCache::FindTable(const EnvOptions& toptions, } StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS); s = options_->table_factory->NewTableReader( - *options_, toptions, std::move(file), file_size, &table_reader); + *options_, toptions, internal_comparator, std::move(file), file_size, + &table_reader); } if (!s.ok()) { @@ -102,6 +104,7 @@ Status TableCache::FindTable(const EnvOptions& toptions, Iterator* TableCache::NewIterator(const ReadOptions& options, const EnvOptions& toptions, + const InternalKeyComparator& icomparator, const FileMetaData& file_meta, TableReader** table_reader_ptr, bool for_compaction) { @@ -111,8 +114,8 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, Cache::Handle* handle = file_meta.table_reader_handle; Status s; if (!handle) { - s = FindTable(toptions, file_meta.number, file_meta.file_size, &handle, - nullptr, options.read_tier == kBlockCacheTier); + s = FindTable(toptions, icomparator, file_meta.number, file_meta.file_size, + &handle, nullptr, options.read_tier == kBlockCacheTier); } if (!s.ok()) { return NewErrorIterator(s); @@ -135,17 +138,17 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, } Status TableCache::Get(const ReadOptions& options, - const FileMetaData& file_meta, - const Slice& k, - void* arg, - bool (*saver)(void*, const Slice&, const Slice&, bool), - bool* table_io, - void (*mark_key_may_exist)(void*)) { + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const Slice& k, void* arg, + bool (*saver)(void*, const ParsedInternalKey&, + const Slice&, bool), + bool* table_io, void (*mark_key_may_exist)(void*)) { Cache::Handle* handle = file_meta.table_reader_handle; Status s; if (!handle) { - s = FindTable(storage_options_, file_meta.number, file_meta.file_size, - &handle, table_io, options.read_tier == kBlockCacheTier); + s = FindTable(storage_options_, internal_comparator, file_meta.number, + file_meta.file_size, &handle, table_io, + options.read_tier == kBlockCacheTier); } if (s.ok()) { TableReader* t = GetTableReaderFromHandle(handle); @@ -162,13 +165,12 @@ Status TableCache::Get(const ReadOptions& options, } bool TableCache::PrefixMayMatch(const ReadOptions& options, - uint64_t file_number, - uint64_t file_size, - const Slice& internal_prefix, - bool* table_io) { + const InternalKeyComparator& icomparator, + uint64_t file_number, uint64_t file_size, + const Slice& internal_prefix, bool* table_io) { Cache::Handle* handle = nullptr; - Status s = FindTable(storage_options_, file_number, - file_size, &handle, table_io); + Status s = FindTable(storage_options_, icomparator, file_number, file_size, + &handle, table_io); bool may_match = true; if (s.ok()) { TableReader* t = GetTableReaderFromHandle(handle); diff --git a/db/table_cache.h b/db/table_cache.h index 665d3b901..316a31888 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -38,8 +38,8 @@ class TableCache { // the returned iterator. The returned "*tableptr" object is owned by // the cache and should not be deleted, and is valid for as long as the // returned iterator is live. 
- Iterator* NewIterator(const ReadOptions& options, - const EnvOptions& toptions, + Iterator* NewIterator(const ReadOptions& options, const EnvOptions& toptions, + const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, TableReader** table_reader_ptr = nullptr, bool for_compaction = false); @@ -48,26 +48,27 @@ class TableCache { // call (*handle_result)(arg, found_key, found_value) repeatedly until // it returns false. Status Get(const ReadOptions& options, - const FileMetaData& file_meta, - const Slice& k, - void* arg, - bool (*handle_result)(void*, const Slice&, const Slice&, bool), - bool* table_io, - void (*mark_key_may_exist)(void*) = nullptr); + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const Slice& k, void* arg, + bool (*handle_result)(void*, const ParsedInternalKey&, + const Slice&, bool), + bool* table_io, void (*mark_key_may_exist)(void*) = nullptr); // Determine whether the table may contain the specified prefix. If // the table index or blooms are not in memory, this may cause an I/O - bool PrefixMayMatch(const ReadOptions& options, uint64_t file_number, - uint64_t file_size, const Slice& internal_prefix, - bool* table_io); + bool PrefixMayMatch(const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + uint64_t file_number, uint64_t file_size, + const Slice& internal_prefix, bool* table_io); // Evict any entry for the specified file number void Evict(uint64_t file_number); // Find table reader - Status FindTable(const EnvOptions& toptions, uint64_t file_number, - uint64_t file_size, Cache::Handle**, bool* table_io=nullptr, - const bool no_io = false); + Status FindTable(const EnvOptions& toptions, + const InternalKeyComparator& internal_comparator, + uint64_t file_number, uint64_t file_size, Cache::Handle**, + bool* table_io = nullptr, const bool no_io = false); // Get TableReader from a cache handle. TableReader* GetTableReaderFromHandle(Cache::Handle* handle); diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 15cbe9213..961a7302b 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -83,13 +83,13 @@ class DumbLogger : public Logger { }; // Utilities test functions -void MakeBuilder( - const Options& options, - std::unique_ptr* writable, - std::unique_ptr* builder) { +void MakeBuilder(const Options& options, + const InternalKeyComparator& internal_comparator, + std::unique_ptr* writable, + std::unique_ptr* builder) { writable->reset(new FakeWritableFile); builder->reset(options.table_factory->NewTableBuilder( - options, writable->get(), options.compression)); + options, internal_comparator, writable->get(), options.compression)); } // Collects keys that starts with "A" in a table. @@ -127,9 +127,8 @@ class RegularKeysStartWithA: public TablePropertiesCollector { extern uint64_t kBlockBasedTableMagicNumber; extern uint64_t kPlainTableMagicNumber; void TestCustomizedTablePropertiesCollector( - uint64_t magic_number, - bool encode_as_internal, - const Options& options) { + uint64_t magic_number, bool encode_as_internal, const Options& options, + const InternalKeyComparator& internal_comparator) { // make sure the entries will be inserted with order. 
std::map kvs = { {"About ", "val5"}, // starts with 'A' @@ -144,7 +143,7 @@ void TestCustomizedTablePropertiesCollector( // -- Step 1: build table std::unique_ptr builder; std::unique_ptr writable; - MakeBuilder(options, &writable, &builder); + MakeBuilder(options, internal_comparator, &writable, &builder); for (const auto& kv : kvs) { if (encode_as_internal) { @@ -193,11 +192,9 @@ TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) { options.table_properties_collectors.resize(1); options.table_properties_collectors[0].reset(collector); } - TestCustomizedTablePropertiesCollector( - kBlockBasedTableMagicNumber, - encode_as_internal, - options - ); + test::PlainInternalKeyComparator ikc(options.comparator); + TestCustomizedTablePropertiesCollector(kBlockBasedTableMagicNumber, + encode_as_internal, options, ikc); } // test plain table @@ -206,9 +203,9 @@ TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) { std::make_shared() ); options.table_factory = std::make_shared(8, 8, 0); - TestCustomizedTablePropertiesCollector( - kPlainTableMagicNumber, true, options - ); + test::PlainInternalKeyComparator ikc(options.comparator); + TestCustomizedTablePropertiesCollector(kPlainTableMagicNumber, true, options, + ikc); } void TestInternalKeyPropertiesCollector( @@ -228,6 +225,8 @@ void TestInternalKeyPropertiesCollector( std::unique_ptr builder; std::unique_ptr writable; Options options; + test::PlainInternalKeyComparator pikc(options.comparator); + options.table_factory = table_factory; if (sanitized) { options.table_properties_collectors = { @@ -239,12 +238,9 @@ void TestInternalKeyPropertiesCollector( // HACK: Set options.info_log to avoid writing log in // SanitizeOptions(). options.info_log = std::make_shared(); - options = SanitizeOptions( - "db", // just a place holder - nullptr, // with skip internal key comparator - nullptr, // don't care filter policy - options - ); + options = SanitizeOptions("db", // just a place holder + &pikc, nullptr, // don't care filter policy + options); options.comparator = comparator; } else { options.table_properties_collectors = { @@ -252,7 +248,7 @@ void TestInternalKeyPropertiesCollector( }; } - MakeBuilder(options, &writable, &builder); + MakeBuilder(options, pikc, &writable, &builder); for (const auto& k : keys) { builder->Add(k.Encode(), "val"); } diff --git a/db/version_set.cc b/db/version_set.cc index f15a3a49a..20a81801d 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -191,11 +191,10 @@ class Version::LevelFileNumIterator : public Iterator { mutable char value_buf_[16]; }; -static Iterator* GetFileIterator(void* arg, - const ReadOptions& options, +static Iterator* GetFileIterator(void* arg, const ReadOptions& options, const EnvOptions& soptions, - const Slice& file_value, - bool for_compaction) { + const InternalKeyComparator& icomparator, + const Slice& file_value, bool for_compaction) { TableCache* cache = reinterpret_cast(arg); if (file_value.size() != 16) { return NewErrorIterator( @@ -210,11 +209,9 @@ static Iterator* GetFileIterator(void* arg, } FileMetaData meta(DecodeFixed64(file_value.data()), DecodeFixed64(file_value.data() + 8)); - return cache->NewIterator(options.prefix ? options_copy : options, - soptions, - meta, - nullptr /* don't need reference to table*/, - for_compaction); + return cache->NewIterator( + options.prefix ? 
options_copy : options, soptions, icomparator, meta, + nullptr /* don't need reference to table*/, for_compaction); } } @@ -234,10 +231,9 @@ bool Version::PrefixMayMatch(const ReadOptions& options, may_match = true; } else { may_match = vset_->table_cache_->PrefixMayMatch( - options, - DecodeFixed64(level_iter->value().data()), - DecodeFixed64(level_iter->value().data() + 8), - internal_prefix, nullptr); + options, vset_->icmp_, DecodeFixed64(level_iter->value().data()), + DecodeFixed64(level_iter->value().data() + 8), internal_prefix, + nullptr); } return may_match; } @@ -255,8 +251,8 @@ Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, return NewEmptyIterator(); } } - return NewTwoLevelIterator(level_iter, &GetFileIterator, - vset_->table_cache_, options, soptions); + return NewTwoLevelIterator(level_iter, &GetFileIterator, vset_->table_cache_, + options, soptions, vset_->icmp_); } void Version::AddIterators(const ReadOptions& options, @@ -265,7 +261,7 @@ void Version::AddIterators(const ReadOptions& options, // Merge all level zero files together since they may overlap for (const FileMetaData* file : files_[0]) { iters->push_back(vset_->table_cache_->NewIterator(options, soptions, - *file)); + vset_->icmp_, *file)); } // For levels > 0, we can use a concatenating iterator that sequentially @@ -315,80 +311,73 @@ static void MarkKeyMayExist(void* arg) { } } -static bool SaveValue(void* arg, const Slice& ikey, const Slice& v, bool didIO){ +static bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, + const Slice& v, bool didIO) { Saver* s = reinterpret_cast(arg); MergeContext* merge_contex = s->merge_context; std::string merge_result; // temporary area for merge results later assert(s != nullptr && merge_contex != nullptr); - ParsedInternalKey parsed_key; // TODO: didIO and Merge? s->didIO = didIO; - if (!ParseInternalKey(ikey, &parsed_key)) { - // TODO: what about corrupt during Merge? - s->state = kCorrupt; - } else { - if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) { - // Key matches. Process it - switch (parsed_key.type) { - case kTypeValue: - if (kNotFound == s->state) { - s->state = kFound; - s->value->assign(v.data(), v.size()); - } else if (kMerge == s->state) { - assert(s->merge_operator != nullptr); - s->state = kFound; - if (!s->merge_operator->FullMerge(s->user_key, &v, - merge_contex->GetOperands(), - s->value, s->logger)) { - RecordTick(s->statistics, NUMBER_MERGE_FAILURES); - s->state = kCorrupt; - } - } else { - assert(false); + if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) { + // Key matches. 
Process it + switch (parsed_key.type) { + case kTypeValue: + if (kNotFound == s->state) { + s->state = kFound; + s->value->assign(v.data(), v.size()); + } else if (kMerge == s->state) { + assert(s->merge_operator != nullptr); + s->state = kFound; + if (!s->merge_operator->FullMerge(s->user_key, &v, + merge_contex->GetOperands(), + s->value, s->logger)) { + RecordTick(s->statistics, NUMBER_MERGE_FAILURES); + s->state = kCorrupt; } - return false; + } else { + assert(false); + } + return false; - case kTypeDeletion: - if (kNotFound == s->state) { - s->state = kDeleted; - } else if (kMerge == s->state) { - s->state = kFound; + case kTypeDeletion: + if (kNotFound == s->state) { + s->state = kDeleted; + } else if (kMerge == s->state) { + s->state = kFound; if (!s->merge_operator->FullMerge(s->user_key, nullptr, merge_contex->GetOperands(), s->value, s->logger)) { - RecordTick(s->statistics, NUMBER_MERGE_FAILURES); - s->state = kCorrupt; - } - } else { - assert(false); - } - return false; - - case kTypeMerge: - assert(s->state == kNotFound || s->state == kMerge); - s->state = kMerge; - merge_contex->PushOperand(v); - while (merge_contex->GetNumOperands() >= 2) { - // Attempt to merge operands together via user associateive merge - if (s->merge_operator->PartialMerge(s->user_key, - merge_contex->GetOperand(0), - merge_contex->GetOperand(1), - &merge_result, - s->logger)) { - merge_contex->PushPartialMergeResult(merge_result); - } else { - // Associative merge returns false ==> stack the operands - break; - } + RecordTick(s->statistics, NUMBER_MERGE_FAILURES); + s->state = kCorrupt; } - return true; - - case kTypeLogData: + } else { assert(false); + } + return false; + + case kTypeMerge: + assert(s->state == kNotFound || s->state == kMerge); + s->state = kMerge; + merge_contex->PushOperand(v); + while (merge_contex->GetNumOperands() >= 2) { + // Attempt to merge operands together via user associateive merge + if (s->merge_operator->PartialMerge( + s->user_key, merge_contex->GetOperand(0), + merge_contex->GetOperand(1), &merge_result, s->logger)) { + merge_contex->PushPartialMergeResult(merge_result); + } else { + // Associative merge returns false ==> stack the operands break; + } } + return true; + + default: + assert(false); + break; } } @@ -521,8 +510,9 @@ void Version::Get(const ReadOptions& options, prev_file = f; #endif bool tableIO = false; - *status = vset_->table_cache_->Get(options, *f, ikey, &saver, SaveValue, - &tableIO, MarkKeyMayExist); + *status = + vset_->table_cache_->Get(options, vset_->icmp_, *f, ikey, &saver, + SaveValue, &tableIO, MarkKeyMayExist); // TODO: examine the behavior for corrupted key if (!status->ok()) { return; @@ -1355,9 +1345,8 @@ class VersionSet::Builder { for (auto& file_meta : *(levels_[level].added_files)) { assert (!file_meta->table_reader_handle); bool table_io; - vset_->table_cache_->FindTable(vset_->storage_options_, - file_meta->number, - file_meta->file_size, + vset_->table_cache_->FindTable(vset_->storage_options_, vset_->icmp_, + file_meta->number, file_meta->file_size, &file_meta->table_reader_handle, &table_io, false); } @@ -2069,8 +2058,9 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { // "ikey" falls in the range for this table. Add the // approximate offset of "ikey" within the table. 
TableReader* table_reader_ptr; - Iterator* iter = table_cache_->NewIterator( - ReadOptions(), storage_options_, *(files[i]), &table_reader_ptr); + Iterator* iter = + table_cache_->NewIterator(ReadOptions(), storage_options_, icmp_, + *(files[i]), &table_reader_ptr); if (table_reader_ptr != nullptr) { result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode()); } @@ -2134,14 +2124,14 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { if (c->level() + which == 0) { for (const auto& file : *c->inputs(which)) { list[num++] = table_cache_->NewIterator( - options, storage_options_compactions_, *file, nullptr, + options, storage_options_compactions_, icmp_, *file, nullptr, true /* for compaction */); } } else { // Create concatenating iterator for the files from this level list[num++] = NewTwoLevelIterator( new Version::LevelFileNumIterator(icmp_, c->inputs(which)), - &GetFileIterator, table_cache_, options, storage_options_, + &GetFileIterator, table_cache_, options, storage_options_, icmp_, true /* for compaction */); } } diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 931d8f3f5..d3454c343 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -57,7 +57,7 @@ static std::string PrintContents(WriteBatch* b) { state.append(")"); count++; break; - case kTypeLogData: + default: assert(false); break; } diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 219f05630..61ff84c0e 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -34,6 +34,7 @@ class TablePropertiesCollector; class Slice; class SliceTransform; class Statistics; +class InternalKeyComparator; using std::shared_ptr; diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index a9be3e572..1bdea049f 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -27,8 +27,6 @@ namespace rocksdb { -class TableFactory; - // -- Block-based Table class FlushBlockPolicyFactory; diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index feda28c1a..e5f3bd4d2 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -21,6 +21,7 @@ #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" #include "rocksdb/options.h" +#include "db/dbformat.h" #include "table/block_based_table_reader.h" #include "table/block.h" #include "table/block_builder.h" @@ -52,6 +53,7 @@ extern const uint64_t kBlockBasedTableMagicNumber struct BlockBasedTableBuilder::Rep { Options options; + const InternalKeyComparator& internal_comparator; WritableFile* file; uint64_t offset = 0; Status status; @@ -71,31 +73,30 @@ struct BlockBasedTableBuilder::Rep { std::string compressed_output; std::unique_ptr flush_block_policy; - Rep(const Options& opt, - WritableFile* f, - FlushBlockPolicyFactory* flush_block_policy_factory, + Rep(const Options& opt, const InternalKeyComparator& icomparator, + WritableFile* f, FlushBlockPolicyFactory* flush_block_policy_factory, CompressionType compression_type) : options(opt), + internal_comparator(icomparator), file(f), - data_block(options), + data_block(options, &internal_comparator), // To avoid linear scan, we make the block_restart_interval to be `1` // in index block builder - index_block(1 /* block_restart_interval */, options.comparator), + index_block(1 /* block_restart_interval */, &internal_comparator), compression_type(compression_type), - filter_block(opt.filter_policy == nullptr ? nullptr - : new FilterBlockBuilder(opt)), + filter_block(opt.filter_policy == nullptr + ? 
nullptr + : new FilterBlockBuilder(opt, &internal_comparator)), flush_block_policy( - flush_block_policy_factory->NewFlushBlockPolicy(data_block)) { - } + flush_block_policy_factory->NewFlushBlockPolicy(data_block)) {} }; BlockBasedTableBuilder::BlockBasedTableBuilder( - const Options& options, - WritableFile* file, - FlushBlockPolicyFactory* flush_block_policy_factory, + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, FlushBlockPolicyFactory* flush_block_policy_factory, CompressionType compression_type) - : rep_(new Rep(options, - file, flush_block_policy_factory, compression_type)) { + : rep_(new Rep(options, internal_comparator, file, + flush_block_policy_factory, compression_type)) { if (rep_->filter_block != nullptr) { rep_->filter_block->StartBlock(0); } @@ -118,7 +119,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { assert(!r->closed); if (!ok()) return; if (r->props.num_entries > 0) { - assert(r->options.comparator->Compare(key, Slice(r->last_key)) > 0); + assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0); } auto should_flush = r->flush_block_policy->Update(key, value); @@ -135,7 +136,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { // entries in the first block and < all entries in subsequent // blocks. if (ok()) { - r->options.comparator->FindShortestSeparator(&r->last_key, key); + r->internal_comparator.FindShortestSeparator(&r->last_key, key); std::string handle_encoding; r->pending_handle.EncodeTo(&handle_encoding); r->index_block.Add(r->last_key, Slice(handle_encoding)); @@ -339,7 +340,7 @@ Status BlockBasedTableBuilder::Finish() { // block, we will finish writing all index entries here and flush them // to storage after metaindex block is written. if (ok() && !empty_data_block) { - r->options.comparator->FindShortSuccessor(&r->last_key); + r->internal_comparator.FindShortSuccessor(&r->last_key); std::string handle_encoding; r->pending_handle.EncodeTo(&handle_encoding); diff --git a/table/block_based_table_builder.h b/table/block_based_table_builder.h index 0752eb399..1c4be1f83 100644 --- a/table/block_based_table_builder.h +++ b/table/block_based_table_builder.h @@ -26,6 +26,7 @@ class BlockBasedTableBuilder : public TableBuilder { // building in *file. Does not close the file. It is up to the // caller to close the file after calling Finish(). 
BlockBasedTableBuilder(const Options& options, + const InternalKeyComparator& internal_comparator, WritableFile* file, FlushBlockPolicyFactory* flush_block_policy_factory, CompressionType compression_type); diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index 3cf064867..6a4a64462 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -20,15 +20,17 @@ namespace rocksdb { Status BlockBasedTableFactory::NewTableReader( const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const { return BlockBasedTable::Open(options, soptions, table_options_, - std::move(file), file_size, table_reader); + internal_comparator, std::move(file), file_size, + table_reader); } TableBuilder* BlockBasedTableFactory::NewTableBuilder( - const Options& options, WritableFile* file, - CompressionType compression_type) const { + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const { auto flush_block_policy_factory = table_options_.flush_block_policy_factory.get(); @@ -45,11 +47,9 @@ TableBuilder* BlockBasedTableFactory::NewTableBuilder( options.block_size_deviation); } - auto table_builder = new BlockBasedTableBuilder( - options, - file, - flush_block_policy_factory, - compression_type); + auto table_builder = + new BlockBasedTableBuilder(options, internal_comparator, file, + flush_block_policy_factory, compression_type); // Delete flush_block_policy_factory only when it's just created from the // options. diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index bdae45a87..2513b9f83 100644 --- a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -35,12 +35,13 @@ class BlockBasedTableFactory : public TableFactory { const char* Name() const override { return "BlockBasedTable"; } Status NewTableReader(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const override; - TableBuilder* NewTableBuilder(const Options& options, WritableFile* file, - CompressionType compression_type) - const override; + TableBuilder* NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const override; private: BlockBasedTableOptions table_options_; diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index da100fee9..f4dd5b2ec 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -39,12 +39,13 @@ const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1; using std::unique_ptr; struct BlockBasedTable::Rep { - Rep(const EnvOptions& storage_options) : - soptions(storage_options) { - } + Rep(const EnvOptions& storage_options, + const InternalKeyComparator& internal_comparator) + : soptions(storage_options), internal_comparator_(internal_comparator) {} Options options; const EnvOptions& soptions; + const InternalKeyComparator& internal_comparator_; Status status; unique_ptr file; char cache_key_prefix[kMaxCacheKeyPrefixSize]; @@ -225,6 +226,7 @@ Cache::Handle* GetFromBlockCache( Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, const BlockBasedTableOptions& table_options, + const InternalKeyComparator& 
internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) { @@ -236,7 +238,7 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, // We've successfully read the footer and the index block: we're // ready to serve requests. - Rep* rep = new BlockBasedTable::Rep(soptions); + Rep* rep = new BlockBasedTable::Rep(soptions, internal_comparator); rep->options = options; rep->file = std::move(file); rep->metaindex_handle = footer.metaindex_handle(); @@ -661,7 +663,7 @@ Iterator* BlockBasedTable::BlockReader(void* arg, Iterator* iter; if (block != nullptr) { - iter = block->NewIterator(table->rep_->options.comparator); + iter = block->NewIterator(&(table->rep_->internal_comparator_)); if (cache_handle != nullptr) { iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle); } else { @@ -734,7 +736,7 @@ BlockBasedTable::GetFilter(bool no_io) const { // Get the iterator from the index block. Iterator* BlockBasedTable::IndexBlockReader(const ReadOptions& options) const { if (rep_->index_block) { - return rep_->index_block->NewIterator(rep_->options.comparator); + return rep_->index_block->NewIterator(&(rep_->internal_comparator_)); } // get index block from cache @@ -755,7 +757,7 @@ Iterator* BlockBasedTable::IndexBlockReader(const ReadOptions& options) const { Iterator* iter; if (entry.value != nullptr) { - iter = entry.value->NewIterator(rep_->options.comparator); + iter = entry.value->NewIterator(&(rep_->internal_comparator_)); if (entry.cache_handle) { iter->RegisterCleanup( &ReleaseBlock, rep_->options.block_cache.get(), entry.cache_handle @@ -769,9 +771,9 @@ Iterator* BlockBasedTable::IndexBlockReader(const ReadOptions& options) const { return iter; } -Iterator* BlockBasedTable::BlockReader(void* arg, - const ReadOptions& options, +Iterator* BlockBasedTable::BlockReader(void* arg, const ReadOptions& options, const EnvOptions& soptions, + const InternalKeyComparator& icomparator, const Slice& index_value, bool for_compaction) { return BlockReader(arg, options, index_value, nullptr, for_compaction); @@ -862,20 +864,15 @@ Iterator* BlockBasedTable::NewIterator(const ReadOptions& options) { } } - return NewTwoLevelIterator( - IndexBlockReader(options), - &BlockBasedTable::BlockReader, - const_cast(this), - options, - rep_->soptions - ); + return NewTwoLevelIterator(IndexBlockReader(options), + &BlockBasedTable::BlockReader, + const_cast(this), options, + rep_->soptions, rep_->internal_comparator_); } Status BlockBasedTable::Get( - const ReadOptions& readOptions, - const Slice& key, - void* handle_context, - bool (*result_handler)(void* handle_context, const Slice& k, + const ReadOptions& readOptions, const Slice& key, void* handle_context, + bool (*result_handler)(void* handle_context, const ParsedInternalKey& k, const Slice& v, bool didIO), void (*mark_key_may_exist_handler)(void* handle_context)) { Status s; @@ -913,8 +910,13 @@ Status BlockBasedTable::Get( // Call the *saver function on each entry/block until it returns false for (block_iter->Seek(key); block_iter->Valid(); block_iter->Next()) { - if (!(*result_handler)(handle_context, block_iter->key(), - block_iter->value(), didIO)) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(block_iter->key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } + + if (!(*result_handler)(handle_context, parsed_key, block_iter->value(), + didIO)) { done = true; break; } @@ -931,7 +933,8 @@ Status BlockBasedTable::Get( return s; } -bool SaveDidIO(void* arg, const Slice& key, const 
Slice& value, bool didIO) { +bool SaveDidIO(void* arg, const ParsedInternalKey& key, const Slice& value, + bool didIO) { *reinterpret_cast(arg) = didIO; return false; } diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index d540f65ad..58e5b0716 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -51,6 +51,7 @@ class BlockBasedTable : public TableReader { // *file must remain live while this Table is in use. static Status Open(const Options& db_options, const EnvOptions& env_options, const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_key_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader); @@ -63,10 +64,11 @@ class BlockBasedTable : public TableReader { Status Get(const ReadOptions& readOptions, const Slice& key, void* handle_context, - bool (*result_handler)(void* handle_context, const Slice& k, - const Slice& v, bool didIO), - void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) - override; + bool (*result_handler)(void* handle_context, + const ParsedInternalKey& k, const Slice& v, + bool didIO), + void (*mark_key_may_exist_handler)(void* handle_context) = + nullptr) override; // Given a key, return an approximate byte offset in the file where // the data for that key begins (or would begin if the key were @@ -97,8 +99,9 @@ class BlockBasedTable : public TableReader { bool compaction_optimized_; static Iterator* BlockReader(void*, const ReadOptions&, - const EnvOptions& soptions, const Slice&, - bool for_compaction); + const EnvOptions& soptions, + const InternalKeyComparator& icomparator, + const Slice&, bool for_compaction); static Iterator* BlockReader(void*, const ReadOptions&, const Slice&, bool* didIO, bool for_compaction = false); diff --git a/table/block_builder.cc b/table/block_builder.cc index 917601865..f812dbae7 100644 --- a/table/block_builder.cc +++ b/table/block_builder.cc @@ -36,6 +36,7 @@ #include #include #include "rocksdb/comparator.h" +#include "db/dbformat.h" #include "util/coding.h" namespace rocksdb { @@ -51,9 +52,8 @@ BlockBuilder::BlockBuilder(int block_restart_interval, restarts_.push_back(0); // First restart point is at offset 0 } -BlockBuilder::BlockBuilder(const Options& options) - : BlockBuilder(options.block_restart_interval, options.comparator) { -} +BlockBuilder::BlockBuilder(const Options& options, const Comparator* comparator) + : BlockBuilder(options.block_restart_interval, comparator) {} void BlockBuilder::Reset() { buffer_.clear(); diff --git a/table/block_builder.h b/table/block_builder.h index 31faf19b8..ed2f290fd 100644 --- a/table/block_builder.h +++ b/table/block_builder.h @@ -21,7 +21,7 @@ class Comparator; class BlockBuilder { public: BlockBuilder(int block_builder, const Comparator* comparator); - explicit BlockBuilder(const Options& options); + explicit BlockBuilder(const Options& options, const Comparator* comparator); // Reset the contents as if the BlockBuilder was just constructed. 
void Reset(); diff --git a/table/block_test.cc b/table/block_test.cc index 7f33e3a90..588ce6729 100644 --- a/table/block_test.cc +++ b/table/block_test.cc @@ -32,9 +32,12 @@ class BlockTest {}; TEST(BlockTest, SimpleTest) { Random rnd(301); Options options = Options(); + std::unique_ptr ic; + ic.reset(new test::PlainInternalKeyComparator(options.comparator)); + std::vector keys; std::vector values; - BlockBuilder builder(options); + BlockBuilder builder(options, ic.get()); int num_records = 100000; char buf[10]; char* p = &buf[0]; diff --git a/table/filter_block.cc b/table/filter_block.cc index 82b6c6ee1..356096d0e 100644 --- a/table/filter_block.cc +++ b/table/filter_block.cc @@ -21,11 +21,12 @@ namespace rocksdb { static const size_t kFilterBaseLg = 11; static const size_t kFilterBase = 1 << kFilterBaseLg; -FilterBlockBuilder::FilterBlockBuilder(const Options& opt) - : policy_(opt.filter_policy), - prefix_extractor_(opt.prefix_extractor), - whole_key_filtering_(opt.whole_key_filtering), - comparator_(opt.comparator){} +FilterBlockBuilder::FilterBlockBuilder(const Options& opt, + const Comparator* internal_comparator) + : policy_(opt.filter_policy), + prefix_extractor_(opt.prefix_extractor), + whole_key_filtering_(opt.whole_key_filtering), + comparator_(internal_comparator) {} void FilterBlockBuilder::StartBlock(uint64_t block_offset) { uint64_t filter_index = (block_offset / kFilterBase); diff --git a/table/filter_block.h b/table/filter_block.h index e47f94653..da19d42e9 100644 --- a/table/filter_block.h +++ b/table/filter_block.h @@ -35,7 +35,8 @@ class FilterPolicy; // (StartBlock AddKey*)* Finish class FilterBlockBuilder { public: - explicit FilterBlockBuilder(const Options& opt); + explicit FilterBlockBuilder(const Options& opt, + const Comparator* internal_comparator); void StartBlock(uint64_t block_offset); void AddKey(const Slice& key); diff --git a/table/filter_block_test.cc b/table/filter_block_test.cc index bc1a0d0ab..1703d59d1 100644 --- a/table/filter_block_test.cc +++ b/table/filter_block_test.cc @@ -55,7 +55,7 @@ class FilterBlockTest { }; TEST(FilterBlockTest, EmptyBuilder) { - FilterBlockBuilder builder(options_); + FilterBlockBuilder builder(options_, options_.comparator); Slice block = builder.Finish(); ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block)); FilterBlockReader reader(options_, block); @@ -64,7 +64,7 @@ TEST(FilterBlockTest, EmptyBuilder) { } TEST(FilterBlockTest, SingleChunk) { - FilterBlockBuilder builder(options_); + FilterBlockBuilder builder(options_, options_.comparator); builder.StartBlock(100); builder.AddKey("foo"); builder.AddKey("bar"); @@ -85,7 +85,7 @@ TEST(FilterBlockTest, SingleChunk) { } TEST(FilterBlockTest, MultiChunk) { - FilterBlockBuilder builder(options_); + FilterBlockBuilder builder(options_, options_.comparator); // First filter builder.StartBlock(0); diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc index 5c3252360..e33ac39f2 100644 --- a/table/plain_table_builder.cc +++ b/table/plain_table_builder.cc @@ -11,6 +11,8 @@ #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" #include "rocksdb/options.h" +#include "table/plain_table_factory.h" +#include "db/dbformat.h" #include "table/block_builder.h" #include "table/filter_block.h" #include "table/format.h" @@ -67,20 +69,32 @@ PlainTableBuilder::~PlainTableBuilder() { } void PlainTableBuilder::Add(const Slice& key, const Slice& value) { - assert(user_key_len_ == 0 || key.size() == user_key_len_ + 8); + size_t user_key_size = key.size() - 8; + 
assert(user_key_len_ == 0 || user_key_size == user_key_len_); if (!IsFixedLength()) { // Write key length - int key_size = key.size(); key_size_str_.clear(); - PutVarint32(&key_size_str_, key_size); + PutVarint32(&key_size_str_, user_key_size); file_->Append(key_size_str_); offset_ += key_size_str_.length(); } // Write key - file_->Append(key); - offset_ += key.size(); + ParsedInternalKey parsed_key; + if (!ParseInternalKey(key, &parsed_key)) { + status_ = Status::Corruption(Slice()); + return; + } + if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) { + file_->Append(Slice(key.data(), user_key_size)); + char tmp_char = PlainTableFactory::kValueTypeSeqId0; + file_->Append(Slice(&tmp_char, 1)); + offset_ += key.size() - 7; + } else { + file_->Append(key); + offset_ += key.size(); + } // Write value length value_size_str_.clear(); @@ -105,9 +119,7 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { ); } -Status PlainTableBuilder::status() const { - return Status::OK(); -} +Status PlainTableBuilder::status() const { return status_; } Status PlainTableBuilder::Finish() { assert(!closed_); diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc index 45ae71c64..c7ee8eb2f 100644 --- a/table/plain_table_factory.cc +++ b/table/plain_table_factory.cc @@ -6,6 +6,7 @@ #include #include +#include "db/dbformat.h" #include "table/plain_table_builder.h" #include "table/plain_table_reader.h" #include "port/port.h" @@ -14,16 +15,18 @@ namespace rocksdb { Status PlainTableFactory::NewTableReader(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& icomp, unique_ptr&& file, uint64_t file_size, unique_ptr* table) const { - return PlainTableReader::Open(options, soptions, std::move(file), file_size, - table, bloom_bits_per_key_, hash_table_ratio_); + return PlainTableReader::Open(options, soptions, icomp, std::move(file), + file_size, table, bloom_bits_per_key_, + hash_table_ratio_); } TableBuilder* PlainTableFactory::NewTableBuilder( - const Options& options, WritableFile* file, - CompressionType compression_type) const { + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const { return new PlainTableBuilder(options, file, user_key_len_); } diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index 55680a3ec..88745ca1b 100644 --- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -57,12 +57,16 @@ class PlainTableFactory : public TableFactory { hash_table_ratio_(hash_table_ratio) {} const char* Name() const override { return "PlainTable"; } Status NewTableReader(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table) const override; + TableBuilder* NewTableBuilder(const Options& options, + const InternalKeyComparator& icomparator, + WritableFile* file, + CompressionType compression_type) const + override; - TableBuilder* NewTableBuilder(const Options& options, WritableFile* file, - CompressionType compression_type) - const override; + static const char kValueTypeSeqId0 = 0xFF; private: uint32_t user_key_len_; diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index 5d769eea2..b07862bad 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -4,8 +4,7 @@ #include "table/plain_table_reader.h" -#include -#include +#include #include "db/dbformat.h" @@ -77,6 +76,7 @@ class 
PlainTableIterator : public Iterator { Slice key_; Slice value_; Status status_; + std::string tmp_str_; // No copying allowed PlainTableIterator(const PlainTableIterator&) = delete; void operator=(const Iterator&) = delete; @@ -84,10 +84,12 @@ class PlainTableIterator : public Iterator { extern const uint64_t kPlainTableMagicNumber; PlainTableReader::PlainTableReader(const EnvOptions& storage_options, + const InternalKeyComparator& icomparator, uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio, const TableProperties& table_properties) : soptions_(storage_options), + internal_comparator_(icomparator), file_size_(file_size), kHashTableRatio(hash_table_ratio), kBloomBitsPerKey(bloom_bits_per_key), @@ -103,6 +105,7 @@ PlainTableReader::~PlainTableReader() { Status PlainTableReader::Open(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader, @@ -122,9 +125,9 @@ Status PlainTableReader::Open(const Options& options, return s; } - std::unique_ptr new_reader( - new PlainTableReader(soptions, file_size, bloom_bits_per_key, - hash_table_ratio, table_properties)); + std::unique_ptr new_reader(new PlainTableReader( + soptions, internal_comparator, file_size, bloom_bits_per_key, + hash_table_ratio, table_properties)); new_reader->file_ = std::move(file); new_reader->options_ = options; @@ -215,10 +218,10 @@ int PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list) { int num_prefixes = 0; while (pos < data_end_offset_) { uint32_t key_offset = pos; - Slice key_slice; + ParsedInternalKey key; Slice value_slice; - status_ = Next(pos, &key_slice, &value_slice, pos); - Slice key_prefix_slice = GetPrefix(key_slice); + status_ = Next(pos, &key, &value_slice, pos); + Slice key_prefix_slice = GetPrefix(key); if (is_first_record || prev_key_prefix_slice != key_prefix_slice) { ++num_prefixes; @@ -413,7 +416,11 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, index_ptr + 4, &upper_bound); uint32_t high = upper_bound; - Slice mid_key; + ParsedInternalKey mid_key; + ParsedInternalKey parsed_target; + if (!ParseInternalKey(target, &parsed_target)) { + return Status::Corruption(Slice()); + } // The key is between [low, high). Do a binary search between it. while (high - low > 1) { @@ -424,8 +431,8 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, if (!s.ok()) { return s; } - int cmp_result = options_.comparator->Compare(target, mid_key); - if (cmp_result > 0) { + int cmp_result = internal_comparator_.Compare(mid_key, parsed_target); + if (cmp_result < 0) { low = mid; } else { if (cmp_result == 0) { @@ -442,7 +449,7 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, // Both of the key at the position low or low+1 could share the same // prefix as target. We need to rule out one of them to avoid to go // to the wrong prefix. 
- Slice low_key; + ParsedInternalKey low_key; size_t tmp; uint32_t low_key_offset = base_ptr[low]; Status s = ReadKey(file_data_.data() + low_key_offset, &low_key, tmp); @@ -465,31 +472,53 @@ bool PlainTableReader::MayHavePrefix(uint32_t hash) { return bloom_ == nullptr || bloom_->MayContainHash(hash); } -Status PlainTableReader::ReadKey(const char* row_ptr, Slice* key, +Slice PlainTableReader::GetPrefix(const ParsedInternalKey& target) { + return options_.prefix_extractor->Transform(target.user_key); +} + +Status PlainTableReader::ReadKey(const char* row_ptr, ParsedInternalKey* key, size_t& bytes_read) { const char* key_ptr = nullptr; bytes_read = 0; - size_t internal_key_size = 0; + size_t user_key_size = 0; if (IsFixedLength()) { - internal_key_size = GetFixedInternalKeyLength(); + user_key_size = user_key_len_; key_ptr = row_ptr; } else { - uint32_t key_size = 0; + uint32_t tmp_size = 0; key_ptr = GetVarint32Ptr(row_ptr, file_data_.data() + data_end_offset_, - &key_size); - internal_key_size = (size_t)key_size; + &tmp_size); + if (key_ptr == nullptr) { + return Status::Corruption("Unable to read the next key"); + } + user_key_size = (size_t)tmp_size; bytes_read = key_ptr - row_ptr; } - if (row_ptr + internal_key_size >= file_data_.data() + data_end_offset_) { + if (key_ptr + user_key_size + 1 >= file_data_.data() + data_end_offset_) { return Status::Corruption("Unable to read the next key"); } - *key = Slice(key_ptr, internal_key_size); - bytes_read += internal_key_size; + + if (*(key_ptr + user_key_size) == PlainTableFactory::kValueTypeSeqId0) { + // Special encoding for the row with seqID=0 + key->user_key = Slice(key_ptr, user_key_size); + key->sequence = 0; + key->type = kTypeValue; + bytes_read += user_key_size + 1; + } else { + if (row_ptr + user_key_size + 8 >= file_data_.data() + data_end_offset_) { + return Status::Corruption("Unable to read the next key"); + } + if (!ParseInternalKey(Slice(key_ptr, user_key_size + 8), key)) { + return Status::Corruption(Slice()); + } + bytes_read += user_key_size + 8; + } + return Status::OK(); } -Status PlainTableReader::Next(uint32_t offset, Slice* key, Slice* value, - uint32_t& next_offset) { +Status PlainTableReader::Next(uint32_t offset, ParsedInternalKey* key, + Slice* value, uint32_t& next_offset) { if (offset == data_end_offset_) { next_offset = data_end_offset_; return Status::OK(); @@ -518,10 +547,11 @@ Status PlainTableReader::Next(uint32_t offset, Slice* key, Slice* value, return Status::OK(); } -Status PlainTableReader::Get( - const ReadOptions& ro, const Slice& target, void* arg, - bool (*saver)(void*, const Slice&, const Slice&, bool), - void (*mark_key_may_exist)(void*)) { +Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, + void* arg, + bool (*saver)(void*, const ParsedInternalKey&, + const Slice&, bool), + void (*mark_key_may_exist)(void*)) { // Check bloom filter first. 
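// Only the prefix of the target, not the full key, is hashed into the
// filter here. Since MayHavePrefix() above returns true whenever bloom_ is
// absent, a false result is definitive: no key with this prefix exists in
// the table, so the lookup can presumably stop before touching the index
// or the data area.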
Slice prefix_slice = GetPrefix(target); uint32_t prefix_hash = GetSliceHash(prefix_slice); @@ -534,7 +564,12 @@ Status PlainTableReader::Get( if (!s.ok()) { return s; } - Slice found_key; + ParsedInternalKey found_key; + ParsedInternalKey parsed_target; + if (!ParseInternalKey(target, &parsed_target)) { + return Status::Corruption(Slice()); + } + Slice found_value; while (offset < data_end_offset_) { Status s = Next(offset, &found_key, &found_value, offset); @@ -549,9 +584,10 @@ Status PlainTableReader::Get( } prefix_match = true; } - if (options_.comparator->Compare(found_key, target) >= 0 - && !(*saver)(arg, found_key, found_value, true)) { - break; + if (internal_comparator_.Compare(found_key, parsed_target) >= 0) { + if (!(*saver)(arg, found_key, found_value, true)) { + break; + } } } return Status::OK(); @@ -612,7 +648,7 @@ void PlainTableIterator::Seek(const Slice& target) { } prefix_match = true; } - if (table_->options_.comparator->Compare(key(), target) >= 0) { + if (table_->internal_comparator_.Compare(key(), target) >= 0) { break; } } @@ -623,8 +659,19 @@ void PlainTableIterator::Seek(const Slice& target) { void PlainTableIterator::Next() { offset_ = next_offset_; - Slice tmp_slice; - status_ = table_->Next(next_offset_, &key_, &value_, next_offset_); + if (offset_ < table_->data_end_offset_) { + Slice tmp_slice; + ParsedInternalKey parsed_key; + status_ = table_->Next(next_offset_, &parsed_key, &value_, next_offset_); + if (status_.ok()) { + // Make a copy in this case. TODO optimize. + tmp_str_.clear(); + AppendInternalKey(&tmp_str_, parsed_key); + key_ = Slice(tmp_str_); + } else { + offset_ = next_offset_ = table_->data_end_offset_; + } + } } void PlainTableIterator::Prev() { @@ -632,10 +679,12 @@ void PlainTableIterator::Prev() { } Slice PlainTableIterator::key() const { + assert(Valid()); return key_; } Slice PlainTableIterator::value() const { + assert(Valid()); return value_; } diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index d223a13d5..1abe4e35c 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -6,8 +6,10 @@ #include #include #include +#include #include +#include "db/dbformat.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/slice_transform.h" @@ -27,6 +29,7 @@ struct ReadOptions; class TableCache; class TableReader; class DynamicBloom; +class InternalKeyComparator; using std::unique_ptr; using std::unordered_map; @@ -43,6 +46,7 @@ extern const uint32_t kPlainTableVariableLength; class PlainTableReader: public TableReader { public: static Status Open(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table, const int bloom_bits_per_key, double hash_table_ratio); @@ -51,10 +55,10 @@ class PlainTableReader: public TableReader { Iterator* NewIterator(const ReadOptions&); - Status Get( - const ReadOptions&, const Slice& key, void* arg, - bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool), - void (*mark_key_may_exist)(void*) = nullptr); + Status Get(const ReadOptions&, const Slice& key, void* arg, + bool (*result_handler)(void* arg, const ParsedInternalKey& k, + const Slice& v, bool), + void (*mark_key_may_exist)(void*) = nullptr); uint64_t ApproximateOffsetOf(const Slice& key); @@ -62,8 +66,10 @@ class PlainTableReader: public TableReader { const TableProperties& GetTableProperties() { return table_properties_; } - PlainTableReader(const EnvOptions& storage_options, uint64_t 
file_size, - int bloom_bits_per_key, double hash_table_ratio, + PlainTableReader(const EnvOptions& storage_options, + const InternalKeyComparator& internal_comparator, + uint64_t file_size, int bloom_num_bits, + double hash_table_ratio, const TableProperties& table_properties); ~PlainTableReader(); @@ -77,6 +83,7 @@ class PlainTableReader: public TableReader { Options options_; const EnvOptions& soptions_; + const InternalKeyComparator internal_comparator_; Status status_; unique_ptr file_; @@ -184,11 +191,13 @@ class PlainTableReader: public TableReader { // too. bool MayHavePrefix(uint32_t hash); - Status ReadKey(const char* row_ptr, Slice* key, size_t& bytes_read); + Status ReadKey(const char* row_ptr, ParsedInternalKey* key, + size_t& bytes_read); // Read the key and value at offset to key and value. // tmp_slice is a tmp slice. // return next_offset as the offset for the next key. - Status Next(uint32_t offset, Slice* key, Slice* value, uint32_t& next_offset); + Status Next(uint32_t offset, ParsedInternalKey* key, Slice* value, + uint32_t& next_offset); // Get file offset for key target. // return value prefix_matched is set to true if the offset is confirmed // for a key with the same prefix as target. @@ -202,6 +211,8 @@ class PlainTableReader: public TableReader { Slice(target.data(), target.size() - 8)); } + Slice GetPrefix(const ParsedInternalKey& target); + // No copying allowed explicit PlainTableReader(const TableReader&) = delete; void operator=(const TableReader&) = delete; diff --git a/table/table_factory.h b/table/table_factory.h index d4b222657..f606a916a 100644 --- a/table/table_factory.h +++ b/table/table_factory.h @@ -53,6 +53,7 @@ class TableFactory { // table_reader is the output table reader virtual Status NewTableReader( const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const = 0; @@ -75,8 +76,8 @@ class TableFactory { // keep the file open and close the file after closing the table builder. // compression_type is the compression type to use in this table. 
virtual TableBuilder* NewTableBuilder( - const Options& options, WritableFile* file, - CompressionType compression_type) const = 0; + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const = 0; }; } // namespace rocksdb diff --git a/table/table_reader.h b/table/table_reader.h index 983c998e7..681ce7233 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -12,6 +12,7 @@ namespace rocksdb { class Iterator; +class ParsedInternalKey; class Slice; struct ReadOptions; struct TableProperties; @@ -62,7 +63,7 @@ class TableReader { // key is the key to search for virtual Status Get( const ReadOptions& readOptions, const Slice& key, void* handle_context, - bool (*result_handler)(void* handle_context, const Slice& k, + bool (*result_handler)(void* arg, const ParsedInternalKey& k, const Slice& v, bool didIO), void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0; }; diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index 88436c1f3..f746592fe 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -34,8 +34,8 @@ static std::string MakeKey(int i, int j, bool through_db) { return key.Encode().ToString(); } -static bool DummySaveValue(void* arg, const Slice& ikey, const Slice& v, - bool didIO) { +static bool DummySaveValue(void* arg, const ParsedInternalKey& ikey, + const Slice& v, bool didIO) { return false; } @@ -237,6 +237,8 @@ int main(int argc, char** argv) { rocksdb::EnvOptions env_options; options.create_if_missing = true; options.compression = rocksdb::CompressionType::kNoCompression; + options.internal_comparator = + new rocksdb::InternalKeyComparator(options.comparator); if (FLAGS_plain_table) { options.allow_mmap_reads = true; diff --git a/table/table_test.cc b/table/table_test.cc index 39f341131..076f5eb07 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -183,8 +183,9 @@ class Constructor { // been added so far. 
Returns the keys in sorted order in "*keys" // and stores the key/value pairs in "*kvmap" void Finish(const Options& options, - std::vector* keys, - KVMap* kvmap) { + const InternalKeyComparator& internal_comparator, + std::vector* keys, KVMap* kvmap) { + last_internal_key_ = &internal_comparator; *kvmap = data_; keys->clear(); for (KVMap::const_iterator it = data_.begin(); @@ -193,12 +194,14 @@ class Constructor { keys->push_back(it->first); } data_.clear(); - Status s = FinishImpl(options, *kvmap); + Status s = FinishImpl(options, internal_comparator, *kvmap); ASSERT_TRUE(s.ok()) << s.ToString(); } // Construct the data structure from the data in "data" - virtual Status FinishImpl(const Options& options, const KVMap& data) = 0; + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) = 0; virtual Iterator* NewIterator() const = 0; @@ -206,6 +209,9 @@ class Constructor { virtual DB* db() const { return nullptr; } // Overridden in DBConstructor + protected: + const InternalKeyComparator* last_internal_key_; + private: KVMap data_; }; @@ -219,10 +225,12 @@ class BlockConstructor: public Constructor { ~BlockConstructor() { delete block_; } - virtual Status FinishImpl(const Options& options, const KVMap& data) { + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) { delete block_; block_ = nullptr; - BlockBuilder builder(options); + BlockBuilder builder(options, &internal_comparator); for (KVMap::const_iterator it = data.begin(); it != data.end(); @@ -298,12 +306,14 @@ class TableConstructor: public Constructor { : Constructor(cmp), convert_to_internal_key_(convert_to_internal_key) {} ~TableConstructor() { Reset(); } - virtual Status FinishImpl(const Options& options, const KVMap& data) { + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) { Reset(); sink_.reset(new StringSink()); unique_ptr builder; - builder.reset(options.table_factory->NewTableBuilder(options, sink_.get(), - options.compression)); + builder.reset(options.table_factory->NewTableBuilder( + options, internal_comparator, sink_.get(), options.compression)); for (KVMap::const_iterator it = data.begin(); it != data.end(); @@ -328,8 +338,8 @@ class TableConstructor: public Constructor { source_.reset(new StringSource(sink_->contents(), uniq_id_, options.allow_mmap_reads)); return options.table_factory->NewTableReader( - options, soptions, std::move(source_), sink_->contents().size(), - &table_reader_); + options, soptions, internal_comparator, std::move(source_), + sink_->contents().size(), &table_reader_); } virtual Iterator* NewIterator() const { @@ -350,8 +360,8 @@ class TableConstructor: public Constructor { new StringSource(sink_->contents(), uniq_id_, options.allow_mmap_reads)); return options.table_factory->NewTableReader( - options, soptions, std::move(source_), sink_->contents().size(), - &table_reader_); + options, soptions, *last_internal_key_, std::move(source_), + sink_->contents().size(), &table_reader_); } virtual TableReader* table_reader() { @@ -393,7 +403,9 @@ class MemTableConstructor: public Constructor { ~MemTableConstructor() { delete memtable_->Unref(); } - virtual Status FinishImpl(const Options& options, const KVMap& data) { + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) { delete memtable_->Unref(); Options 
memtable_options; memtable_options.memtable_factory = table_factory_; @@ -429,7 +441,9 @@ class DBConstructor: public Constructor { ~DBConstructor() { delete db_; } - virtual Status FinishImpl(const Options& options, const KVMap& data) { + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) { delete db_; db_ = nullptr; NewDB(); @@ -619,7 +633,10 @@ class Harness { if (args.reverse_compare) { options_.comparator = &reverse_key_comparator; } - internal_comparator_.reset(new InternalKeyComparator(options_.comparator)); + + internal_comparator_.reset( + new test::PlainInternalKeyComparator(options_.comparator)); + support_prev_ = true; only_support_prefix_seek_ = false; BlockBasedTableOptions table_options; @@ -638,7 +655,8 @@ class Harness { options_.allow_mmap_reads = true; options_.table_factory.reset(new PlainTableFactory()); constructor_ = new TableConstructor(options_.comparator, true); - options_.comparator = internal_comparator_.get(); + internal_comparator_.reset( + new InternalKeyComparator(options_.comparator)); break; case PLAIN_TABLE_FULL_STR_PREFIX: support_prev_ = false; @@ -647,7 +665,8 @@ class Harness { options_.allow_mmap_reads = true; options_.table_factory.reset(new PlainTableFactory()); constructor_ = new TableConstructor(options_.comparator, true); - options_.comparator = internal_comparator_.get(); + internal_comparator_.reset( + new InternalKeyComparator(options_.comparator)); break; case BLOCK_TEST: constructor_ = new BlockConstructor(options_.comparator); @@ -672,7 +691,7 @@ class Harness { void Test(Random* rnd) { std::vector keys; KVMap data; - constructor_->Finish(options_, &keys, &data); + constructor_->Finish(options_, *internal_comparator_, &keys, &data); TestForwardScan(keys, data); if (support_prev_) { @@ -844,7 +863,7 @@ class Harness { Constructor* constructor_; bool support_prev_; bool only_support_prefix_seek_; - shared_ptr internal_comparator_; + shared_ptr internal_comparator_; static std::unique_ptr noop_transform; static std::unique_ptr prefix_transform; }; @@ -866,9 +885,24 @@ static bool Between(uint64_t val, uint64_t low, uint64_t high) { } // Tests against all kinds of tables -class GeneralTableTest {}; -class BlockBasedTableTest {}; -class PlainTableTest {}; +class TableTest { + public: + const InternalKeyComparator& GetPlainInternalComparator( + const Comparator* comp) { + if (!plain_internal_comparator) { + plain_internal_comparator.reset( + new test::PlainInternalKeyComparator(comp)); + } + return *plain_internal_comparator; + } + + private: + std::unique_ptr plain_internal_comparator; +}; + +class GeneralTableTest : public TableTest {}; +class BlockBasedTableTest : public TableTest {}; +class PlainTableTest : public TableTest {}; // This test include all the basic checks except those for index size and block // size, which will be conducted in separated unit tests. @@ -891,7 +925,8 @@ TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) { options.compression = kNoCompression; options.block_restart_interval = 1; - c.Finish(options, &keys, &kvmap); + c.Finish(options, GetPlainInternalComparator(options.comparator), &keys, + &kvmap); auto& props = c.table_reader()->GetTableProperties(); ASSERT_EQ(kvmap.size(), props.num_entries); @@ -905,7 +940,7 @@ TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) { ASSERT_EQ("", props.filter_policy_name); // no filter policy is used // Verify data size. 
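// The loop below feeds the same key/value pairs through a standalone
// BlockBuilder, presumably so that the expected raw size of the data can
// be derived independently of the table builder and compared against the
// size reported in the table properties.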
- BlockBuilder block_builder(options); + BlockBuilder block_builder(options, options.comparator); for (const auto& item : kvmap) { block_builder.Add(item.first, item.second); } @@ -927,7 +962,8 @@ TEST(BlockBasedTableTest, FilterPolicyNameProperties) { ); options.filter_policy = filter_policy.get(); - c.Finish(options, &keys, &kvmap); + c.Finish(options, GetPlainInternalComparator(options.comparator), &keys, + &kvmap); auto& props = c.table_reader()->GetTableProperties(); ASSERT_EQ("rocksdb.BuiltinBloomFilter", props.filter_policy_name); } @@ -968,7 +1004,8 @@ TEST(BlockBasedTableTest, IndexSizeStat) { options.compression = kNoCompression; options.block_restart_interval = 1; - c.Finish(options, &ks, &kvmap); + c.Finish(options, GetPlainInternalComparator(options.comparator), &ks, + &kvmap); auto index_size = c.table_reader()->GetTableProperties().index_size; ASSERT_GT(index_size, last_index_size); @@ -992,7 +1029,8 @@ TEST(BlockBasedTableTest, NumBlockStat) { std::vector ks; KVMap kvmap; - c.Finish(options, &ks, &kvmap); + c.Finish(options, GetPlainInternalComparator(options.comparator), &ks, + &kvmap); ASSERT_EQ( kvmap.size(), c.table_reader()->GetTableProperties().num_data_blocks @@ -1055,7 +1093,8 @@ TEST(BlockBasedTableTest, BlockCacheTest) { TableConstructor c(BytewiseComparator()); c.Add("key", "value"); - c.Finish(options, &keys, &kvmap); + c.Finish(options, GetPlainInternalComparator(options.comparator), &keys, + &kvmap); // -- PART 1: Open with regular block cache. // Since block_cache is disabled, no cache activities will be involved. @@ -1179,6 +1218,8 @@ TEST(BlockBasedTableTest, BlockCacheLeak) { // unique ID from the file. Options opt; + unique_ptr ikc; + ikc.reset(new test::PlainInternalKeyComparator(opt.comparator)); opt.block_size = 1024; opt.compression = kNoCompression; opt.block_cache = @@ -1195,7 +1236,7 @@ TEST(BlockBasedTableTest, BlockCacheLeak) { c.Add("k07", std::string(100000, 'x')); std::vector keys; KVMap kvmap; - c.Finish(opt, &keys, &kvmap); + c.Finish(opt, *ikc, &keys, &kvmap); unique_ptr iter(c.NewIterator()); iter->SeekToFirst(); @@ -1217,11 +1258,14 @@ extern const uint64_t kPlainTableMagicNumber; TEST(PlainTableTest, BasicPlainTableProperties) { PlainTableFactory factory(8, 8, 0); StringSink sink; + Options options; + InternalKeyComparator ikc(options.comparator); std::unique_ptr builder( - factory.NewTableBuilder(Options(), &sink, kNoCompression)); + factory.NewTableBuilder(options, ikc, &sink, kNoCompression)); for (char c = 'a'; c <= 'z'; ++c) { - std::string key(16, c); + std::string key(8, c); + key.append("\1 "); // PlainTable expects internal key structure std::string value(28, c + 42); builder->Add(key, value); } @@ -1255,9 +1299,10 @@ TEST(GeneralTableTest, ApproximateOffsetOfPlain) { std::vector keys; KVMap kvmap; Options options; + test::PlainInternalKeyComparator internal_comparator(options.comparator); options.block_size = 1024; options.compression = kNoCompression; - c.Finish(options, &keys, &kvmap); + c.Finish(options, internal_comparator, &keys, &kvmap); ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); @@ -1284,9 +1329,10 @@ static void DoCompressionTest(CompressionType comp) { std::vector keys; KVMap kvmap; Options options; + test::PlainInternalKeyComparator ikc(options.comparator); options.block_size = 1024; options.compression = comp; - c.Finish(options, &keys, &kvmap); + c.Finish(options, ikc, &keys, &kvmap); ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); 
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index ac2d8d3d9..65a58ad93 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -20,18 +20,17 @@ namespace rocksdb { namespace { typedef Iterator* (*BlockFunction)(void*, const ReadOptions&, - const EnvOptions& soptions, const Slice&, - bool for_compaction); + const EnvOptions& soptions, + const InternalKeyComparator& icomparator, + const Slice&, bool for_compaction); class TwoLevelIterator: public Iterator { public: - TwoLevelIterator( - Iterator* index_iter, - BlockFunction block_function, - void* arg, - const ReadOptions& options, - const EnvOptions& soptions, - bool for_compaction); + TwoLevelIterator(Iterator* index_iter, BlockFunction block_function, + void* arg, const ReadOptions& options, + const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + bool for_compaction); virtual ~TwoLevelIterator(); @@ -76,6 +75,7 @@ class TwoLevelIterator: public Iterator { void* arg_; const ReadOptions options_; const EnvOptions& soptions_; + const InternalKeyComparator& internal_comparator_; Status status_; IteratorWrapper index_iter_; IteratorWrapper data_iter_; // May be nullptr @@ -86,20 +86,17 @@ class TwoLevelIterator: public Iterator { }; TwoLevelIterator::TwoLevelIterator( - Iterator* index_iter, - BlockFunction block_function, - void* arg, - const ReadOptions& options, - const EnvOptions& soptions, - bool for_compaction) + Iterator* index_iter, BlockFunction block_function, void* arg, + const ReadOptions& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, bool for_compaction) : block_function_(block_function), arg_(arg), options_(options), soptions_(soptions), + internal_comparator_(internal_comparator), index_iter_(index_iter), data_iter_(nullptr), - for_compaction_(for_compaction) { -} + for_compaction_(for_compaction) {} TwoLevelIterator::~TwoLevelIterator() { } @@ -181,8 +178,9 @@ void TwoLevelIterator::InitDataBlock() { // data_iter_ is already constructed with this iterator, so // no need to change anything } else { - Iterator* iter = (*block_function_)(arg_, options_, soptions_, handle, - for_compaction_); + Iterator* iter = + (*block_function_)(arg_, options_, soptions_, internal_comparator_, + handle, for_compaction_); data_block_handle_.assign(handle.data(), handle.size()); SetDataIterator(iter); } @@ -191,15 +189,14 @@ void TwoLevelIterator::InitDataBlock() { } // namespace -Iterator* NewTwoLevelIterator( - Iterator* index_iter, - BlockFunction block_function, - void* arg, - const ReadOptions& options, - const EnvOptions& soptions, - bool for_compaction) { - return new TwoLevelIterator(index_iter, block_function, arg, - options, soptions, for_compaction); +Iterator* NewTwoLevelIterator(Iterator* index_iter, + BlockFunction block_function, void* arg, + const ReadOptions& options, + const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + bool for_compaction) { + return new TwoLevelIterator(index_iter, block_function, arg, options, + soptions, internal_comparator, for_compaction); } } // namespace rocksdb diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h index 85aed3f14..d313dcb18 100644 --- a/table/two_level_iterator.h +++ b/table/two_level_iterator.h @@ -14,6 +14,7 @@ namespace rocksdb { struct ReadOptions; +class InternalKeyComparator; // Return a new two level iterator. 
A two-level iterator contains an // index iterator whose values point to a sequence of blocks where @@ -27,14 +28,11 @@ struct ReadOptions; extern Iterator* NewTwoLevelIterator( Iterator* index_iter, Iterator* (*block_function)( - void* arg, - const ReadOptions& options, - const EnvOptions& soptions, - const Slice& index_value, - bool for_compaction), - void* arg, - const ReadOptions& options, - const EnvOptions& soptions, + void* arg, const ReadOptions& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + const Slice& index_value, bool for_compaction), + void* arg, const ReadOptions& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, bool for_compaction = false); } // namespace rocksdb diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc index 7eb339659..79b361841 100644 --- a/tools/sst_dump.cc +++ b/tools/sst_dump.cc @@ -71,7 +71,6 @@ SstFileReader::SstFileReader(const std::string& file_path, } Status SstFileReader::NewTableReader(const std::string& file_path) { - table_options_.comparator = &internal_comparator_; Status s = table_options_.env->NewRandomAccessFile(file_path, &file_, soptions_); if (!s.ok()) { @@ -81,7 +80,8 @@ Status SstFileReader::NewTableReader(const std::string& file_path) { table_options_.env->GetFileSize(file_path, &file_size); unique_ptr table_factory; s = table_options_.table_factory->NewTableReader( - table_options_, soptions_, std::move(file_), file_size, &table_reader_); + table_options_, soptions_, internal_comparator_, std::move(file_), + file_size, &table_reader_); return s; } diff --git a/util/testutil.h b/util/testutil.h index c73210fec..4fc8c0f5b 100644 --- a/util/testutil.h +++ b/util/testutil.h @@ -8,6 +8,8 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once +#include +#include "db/dbformat.h" #include "rocksdb/env.h" #include "rocksdb/slice.h" #include "util/random.h" @@ -51,5 +53,28 @@ class ErrorEnv : public EnvWrapper { } }; +// An internal comparator that simply forwards comparison results from the +// user comparator it wraps. Can be used to test entities that have no +// dependency on the internal key structure but consume an +// InternalKeyComparator, like BlockBasedTable. +class PlainInternalKeyComparator : public InternalKeyComparator { + public: + explicit PlainInternalKeyComparator(const Comparator* c) + : InternalKeyComparator(c) {} + + virtual ~PlainInternalKeyComparator() {} + + virtual int Compare(const Slice& a, const Slice& b) const override { + return user_comparator()->Compare(a, b); + } + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const override { + user_comparator()->FindShortestSeparator(start, limit); + } + virtual void FindShortSuccessor(std::string* key) const override { + user_comparator()->FindShortSuccessor(key); + } +}; + } // namespace test } // namespace rocksdb From d43ebd8c655b62ab8604d19dae4bd84a6d133f55 Mon Sep 17 00:00:00 2001 From: kailiu Date: Mon, 3 Feb 2014 19:48:45 -0800 Subject: [PATCH 68/70] Put table factory back to public api Summary: Previously I was too ambitious and tried to hide every detail about the table factory in the internal API. However, we cannot pass the compilation for external users, since we expose the table factory through a shared_ptr, which requires the definition of the table factory's destructor.
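A minimal sketch of the failure mode (the helper function and its names below are hypothetical, not code from this patch): a client translation unit that only sees a forward declaration of TableFactory cannot reset such a shared_ptr, because std::shared_ptr has to synthesize a deleter at that point.

  #include <memory>

  class TableFactory;  // forward declaration only; the definition used to
                       // live in the internal table/table_factory.h

  void SetFactory(std::shared_ptr<TableFactory>* out, TableFactory* raw) {
    out->reset(raw);  // fails to compile: forming the deleter requires
                      // TableFactory to be a complete type, i.e. its
                      // destructor must be visible here
  }

Moving the full TableFactory definition into the public rocksdb/table.h avoids this, since client code then always sees the destructor.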
Test Plan: make check Reviewers: sdong, haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D15861 --- db/builder.cc | 2 +- db/table_cache.h | 2 +- include/rocksdb/table.h | 66 ++++++++++++++++++++++++ table/block_based_table_factory.h | 1 - table/plain_table_factory.h | 1 - table/table_factory.h | 83 ------------------------------- table/table_reader.h | 2 +- 7 files changed, 69 insertions(+), 88 deletions(-) delete mode 100644 table/table_factory.h diff --git a/db/builder.cc b/db/builder.cc index 96fb29eef..08e76b539 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -18,8 +18,8 @@ #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" +#include "rocksdb/table.h" #include "table/block_based_table_builder.h" -#include "table/table_factory.h" #include "util/stop_watch.h" namespace rocksdb { diff --git a/db/table_cache.h b/db/table_cache.h index 316a31888..5ec0838cb 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -17,7 +17,7 @@ #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/env.h" -#include "table/table_factory.h" +#include "rocksdb/table.h" #include "table/table_reader.h" namespace rocksdb { diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 1bdea049f..d4965ca45 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -29,6 +29,14 @@ namespace rocksdb { // -- Block-based Table class FlushBlockPolicyFactory; +class RandomAccessFile; +class TableBuilder; +class TableReader; +class WritableFile; +struct EnvOptions; +struct Options; + +using std::unique_ptr; // For advanced user only struct BlockBasedTableOptions { @@ -67,4 +75,62 @@ extern TableFactory* NewPlainTableFactory( uint32_t user_key_len = kPlainTableVariableLength, int bloom_bits_per_key = 10, double hash_table_ratio = 0.75); +// A base class for table factories. +class TableFactory { + public: + virtual ~TableFactory() {} + + // The type of the table. + // + // The client of this package should switch to a new name whenever + // the table format implementation changes. + // + // Names starting with "rocksdb." are reserved and should not be used + // by any clients of this package. + virtual const char* Name() const = 0; + + // Returns a Table object that can fetch data from the file specified + // in parameter file. It's the caller's responsibility to make sure the + // file is in the correct format. + // + // NewTableReader() is called in two places: + // (1) TableCache::FindTable() calls the function on a table cache miss + // and caches the table object returned. + // (2) SstFileReader (for SST Dump) opens the table and dumps the table + // contents using the iterator of the table. + // options and soptions are options. options is the general options. + // Multiple configured objects can be accessed from there, including but + // not limited to the block cache and key comparators. + // file is a file handle for the table file + // file_size is the physical file size of the file + // table_reader is the output table reader + virtual Status NewTableReader( + const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader) const = 0; + + // Return a table builder to write to a file for this table type.
+ // + // It is called in several places: + // (1) When flushing memtable to a level-0 output file, it creates a table + // builder (In DBImpl::WriteLevel0Table(), by calling BuildTable()) + // (2) During compaction, it gets the builder for writing compaction output + // files in DBImpl::OpenCompactionOutputFile(). + // (3) When recovering from transaction logs, it creates a table builder to + // write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery, + // by calling BuildTable()) + // (4) When running Repairer, it creates a table builder to convert logs to + // SST files (In Repairer::ConvertLogToTable() by calling BuildTable()) + // + // options is the general options. Multiple configured objects can be + // accessed from there, including but not limited to compression options. + // file is a handle of a writable file. It is the caller's responsibility to + // keep the file open and close the file after closing the table builder. + // compression_type is the compression type to use in this table. + virtual TableBuilder* NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const = 0; +}; + } // namespace rocksdb diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index 2513b9f83..556997065 100644 --- a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -14,7 +14,6 @@ #include "rocksdb/flush_block_policy.h" #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "table/table_factory.h" namespace rocksdb { diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index 88745ca1b..382efe3c1 100644 --- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -8,7 +8,6 @@ #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "table/table_factory.h" namespace rocksdb { diff --git a/table/table_factory.h b/table/table_factory.h deleted file mode 100644 index f606a916a..000000000 --- a/table/table_factory.h +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -#pragma once - -#include -#include "rocksdb/status.h" - -namespace rocksdb { - -using std::unique_ptr; - -class RandomAccessFile; -class TableBuilder; -class TableReader; -class WritableFile; -struct EnvOptions; -struct Options; - -// A base class for table factories -class TableFactory { - public: - virtual ~TableFactory() {} - - // The type of the table. - // - // The client of this package should switch to a new name whenever - // the table format implementation changes. - // - // Names starting with "rocksdb." are reserved and should not be used - // by any clients of this package. - virtual const char* Name() const = 0; - - // Returns a Table object table that can fetch data from file specified - // in parameter file. It's the caller's responsibility to make sure - // file is in the correct format.
- // - // NewTableReader() is called in two places: - // (1) TableCache::FindTable() calls the function when table cache miss - // and cache the table object returned. - // (1) SstFileReader (for SST Dump) opens the table and dump the table - // contents using the interator of the table. - // options and soptions are options. options is the general options. - // Multiple configured can be accessed from there, including and not - // limited to block cache and key comparators. - // file is a file handler to handle the file for the table - // file_size is the physical file size of the file - // table_reader is the output table reader - virtual Status NewTableReader( - const Options& options, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader) const = 0; - - // Return a table builder to write to a file for this table type. - // - // It is called in several places: - // (1) When flushing memtable to a level-0 output file, it creates a table - // builder (In DBImpl::WriteLevel0Table(), by calling BuildTable()) - // (2) During compaction, it gets the builder for writing compaction output - // files in DBImpl::OpenCompactionOutputFile(). - // (3) When recovering from transaction logs, it creates a table builder to - // write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery, - // by calling BuildTable()) - // (4) When running Repairer, it creates a table builder to convert logs to - // SST files (In Repairer::ConvertLogToTable() by calling BuildTable()) - // - // options is the general options. Multiple configured can be acceseed from - // there, including and not limited to compression options. - // file is a handle of a writable file. It is the caller's responsibility to - // keep the file open and close the file after closing the table builder. - // compression_type is the compression type to use in this table. 
- virtual TableBuilder* NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const = 0; -}; - -} // namespace rocksdb diff --git a/table/table_reader.h b/table/table_reader.h index 681ce7233..9acbb33d0 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -12,7 +12,7 @@ namespace rocksdb { class Iterator; -class ParsedInternalKey; +struct ParsedInternalKey; class Slice; struct ReadOptions; struct TableProperties; From aa6fbbfae78bd6a08ef8e64ce0b69f2a365249fe Mon Sep 17 00:00:00 2001 From: lisyarus Date: Thu, 6 Feb 2014 01:34:18 +0400 Subject: [PATCH 69/70] Fix build in case zlib is not found --- port/port_posix.h | 1 - 1 file changed, 1 deletion(-) diff --git a/port/port_posix.h b/port/port_posix.h index 15ab0dc5b..8ff2480a3 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -396,7 +396,6 @@ inline char* BZip2_Uncompress(const char* input_data, size_t input_length, _stream.next_out = (char *)(output + old_sz); _stream.avail_out = output_len - old_sz; break; - case Z_BUF_ERROR: default: delete[] output; BZ2_bzDecompressEnd(&_stream); From fd0ffbc7ca929b76360e9dcb5f163af7666b6596 Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Thu, 6 Feb 2014 00:11:18 -0800 Subject: [PATCH 70/70] Disable the html-based coverage report by default --- Makefile | 2 +- coverage/coverage_test.sh | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index bf03dcbf9..d1ea5092e 100644 --- a/Makefile +++ b/Makefile @@ -145,7 +145,7 @@ release: coverage: $(MAKE) clean - COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) all check + COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) all check -j32 (cd coverage; ./coverage_test.sh) # Delete intermediate files find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; diff --git a/coverage/coverage_test.sh b/coverage/coverage_test.sh index 7a8b5e0fe..08dbd05a5 100755 --- a/coverage/coverage_test.sh +++ b/coverage/coverage_test.sh @@ -44,6 +44,11 @@ $GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null | tee -a $RECENT_REPORT && echo -e "Generated coverage report for recently updated files: $RECENT_REPORT\n" +# Unless otherwise specified, we won't generate the html report by default +if [ -z "$HTML" ]; then + exit 0 +fi + # Generate the html report. If we cannot find lcov in this machine, we'll simply # skip this step. echo "Generating the html coverage report..."
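With this change the html report becomes opt-in. Assuming the variable is simply read from the environment, as the -z "$HTML" test suggests, running the script as HTML=1 ./coverage_test.sh should restore the old behavior, while the plain make coverage target now stops after the text reports.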