diff --git a/HISTORY.md b/HISTORY.md index f7fac7b0d..0a69d12da 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,8 +6,12 @@ executed in high priority thread pool. ## Unreleased (will be relased in 2.8) -* By default, checksums are verified on every read from database +## Unreleased + +### Public API changes +* Removed arena.h from public header files. +* By default, checksums are verified on every read from database ## 2.7.0 (01/28/2014) diff --git a/Makefile b/Makefile index 99892b761..c7cac9249 100644 --- a/Makefile +++ b/Makefile @@ -6,11 +6,7 @@ INSTALL_PATH ?= $(CURDIR) #----------------------------------------------- -# Uncomment exactly one of the lines labelled (A), (B), and (C) below -# to switch between compilation modes. - -# OPT ?= -DNDEBUG # (A) Production use (optimized mode) -OPT += -O2 -fno-omit-frame-pointer -momit-leaf-frame-pointer +OPT += -fno-omit-frame-pointer -momit-leaf-frame-pointer #----------------------------------------------- # detect what platform we're building on @@ -57,6 +53,7 @@ TESTS = \ auto_roll_logger_test \ block_test \ bloom_test \ + dynamic_bloom_test \ c_test \ cache_test \ coding_test \ @@ -75,6 +72,7 @@ TESTS = \ merge_test \ redis_test \ reduce_levels_test \ + plain_table_db_test \ simple_table_db_test \ skiplist_test \ stringappend_test \ @@ -93,6 +91,7 @@ TOOLS = \ db_repl_stress \ blob_store_bench + PROGRAMS = db_bench signal_test $(TESTS) $(TOOLS) BENCHMARKS = db_bench_sqlite3 db_bench_tree_db table_reader_bench @@ -143,11 +142,11 @@ all: $(LIBRARY) $(PROGRAMS) # Will also generate shared libraries. release: $(MAKE) clean - OPT=-DNDEBUG $(MAKE) all -j32 + OPT="-DNDEBUG -O2" $(MAKE) all -j32 coverage: $(MAKE) clean - COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) all check + COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) all check -j32 (cd coverage; ./coverage_test.sh) # Delete intermediate files find . 
-type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; @@ -248,6 +247,9 @@ table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJEC bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +dynamic_bloom_test: util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) @@ -278,11 +280,14 @@ crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +plain_table_db_test: db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + $(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh index ceae38192..2d6062009 100755 --- a/build_tools/format-diff.sh +++ b/build_tools/format-diff.sh @@ -47,7 +47,6 @@ fi # ln -s `git rev-parse 
--show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH # fi # fi - set -e uncommitted_code=`git diff HEAD` @@ -55,7 +54,6 @@ uncommitted_code=`git diff HEAD` # If there's no uncommitted changes, we assume user are doing post-commit # format check, in which case we'll check the modified lines from latest commit. # Otherwise, we'll check format of the uncommitted code only. -format_last_commit=0 if [ -z "$uncommitted_code" ] then # Check the format of last commit diff --git a/coverage/coverage_test.sh b/coverage/coverage_test.sh index 7a8b5e0fe..08dbd05a5 100755 --- a/coverage/coverage_test.sh +++ b/coverage/coverage_test.sh @@ -44,6 +44,11 @@ $GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null | tee -a $RECENT_REPORT && echo -e "Generated coverage report for recently updated files: $RECENT_REPORT\n" +# Unless otherwise specified, we'll not generate html report by default +if [ -z "$HTML" ]; then + exit 0 +fi + # Generate the html report. If we cannot find lcov in this machine, we'll simply # skip this step. echo "Generating the html coverage report..." 
diff --git a/db/builder.cc b/db/builder.cc index 61671db0d..08e76b539 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -9,16 +9,16 @@ #include "db/builder.h" -#include "db/filename.h" #include "db/dbformat.h" +#include "db/filename.h" #include "db/merge_helper.h" #include "db/table_cache.h" #include "db/version_edit.h" #include "rocksdb/db.h" -#include "rocksdb/table.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" +#include "rocksdb/table.h" #include "table/block_based_table_builder.h" #include "util/stop_watch.h" @@ -26,20 +26,18 @@ namespace rocksdb { class TableFactory; -TableBuilder* GetTableBuilder(const Options& options, WritableFile* file, +TableBuilder* NewTableBuilder(const Options& options, + const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) { - return options.table_factory->GetTableBuilder(options, file, - compression_type); + return options.table_factory->NewTableBuilder(options, internal_comparator, + file, compression_type); } -Status BuildTable(const std::string& dbname, - Env* env, - const Options& options, - const EnvOptions& soptions, - TableCache* table_cache, - Iterator* iter, - FileMetaData* meta, - const Comparator* user_comparator, +Status BuildTable(const std::string& dbname, Env* env, const Options& options, + const EnvOptions& soptions, TableCache* table_cache, + Iterator* iter, FileMetaData* meta, + const InternalKeyComparator& internal_comparator, const SequenceNumber newest_snapshot, const SequenceNumber earliest_seqno_in_memtable, const CompressionType compression) { @@ -64,8 +62,8 @@ Status BuildTable(const std::string& dbname, return s; } - TableBuilder* builder = GetTableBuilder(options, file.get(), - compression); + TableBuilder* builder = + NewTableBuilder(options, internal_comparator, file.get(), compression); // the first key is the smallest key Slice key = iter->key(); @@ -73,8 +71,8 @@ Status BuildTable(const std::string& dbname, 
meta->smallest_seqno = GetInternalKeySeqno(key); meta->largest_seqno = meta->smallest_seqno; - MergeHelper merge(user_comparator, options.merge_operator.get(), - options.info_log.get(), + MergeHelper merge(internal_comparator.user_comparator(), + options.merge_operator.get(), options.info_log.get(), true /* internal key corruption is not ok */); if (purge) { @@ -103,8 +101,8 @@ Status BuildTable(const std::string& dbname, // If the key is the same as the previous key (and it is not the // first key), then we skip it, since it is an older version. // Otherwise we output the key and mark it as the "new" previous key. - if (!is_first_key && !user_comparator->Compare(prev_ikey.user_key, - this_ikey.user_key)) { + if (!is_first_key && !internal_comparator.user_comparator()->Compare( + prev_ikey.user_key, this_ikey.user_key)) { // seqno within the same key are in decreasing order assert(this_ikey.sequence < prev_ikey.sequence); } else { @@ -202,10 +200,8 @@ Status BuildTable(const std::string& dbname, if (s.ok()) { // Verify that the table is usable - Iterator* it = table_cache->NewIterator(ReadOptions(), - soptions, - meta->number, - meta->file_size); + Iterator* it = table_cache->NewIterator(ReadOptions(), soptions, + internal_comparator, *meta); s = it->status(); delete it; } diff --git a/db/builder.h b/db/builder.h index 2600dc24b..630162968 100644 --- a/db/builder.h +++ b/db/builder.h @@ -24,23 +24,20 @@ class VersionEdit; class TableBuilder; class WritableFile; - -extern TableBuilder* GetTableBuilder(const Options& options, WritableFile* file, - CompressionType compression_type); +extern TableBuilder* NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type); // Build a Table file from the contents of *iter. The generated file // will be named according to meta->number. On success, the rest of // *meta will be filled with metadata about the generated table. 
// If no data is present in *iter, meta->file_size will be set to // zero, and no Table file will be produced. -extern Status BuildTable(const std::string& dbname, - Env* env, - const Options& options, - const EnvOptions& soptions, - TableCache* table_cache, - Iterator* iter, +extern Status BuildTable(const std::string& dbname, Env* env, + const Options& options, const EnvOptions& soptions, + TableCache* table_cache, Iterator* iter, FileMetaData* meta, - const Comparator* user_comparator, + const InternalKeyComparator& internal_comparator, const SequenceNumber newest_snapshot, const SequenceNumber earliest_seqno_in_memtable, const CompressionType compression); diff --git a/db/column_family.cc b/db/column_family.cc index 6f396f29f..ba8bd643f 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -17,6 +17,7 @@ #include "db/internal_stats.h" #include "db/compaction_picker.h" #include "db/table_properties_collector.h" +#include "util/autovector.h" #include "util/hash_skiplist_rep.h" namespace rocksdb { @@ -184,7 +185,7 @@ ColumnFamilyData::~ColumnFamilyData() { if (mem_ != nullptr) { delete mem_->Unref(); } - std::vector to_delete; + autovector to_delete; imm_.current()->Unref(&to_delete); for (MemTable* m : to_delete) { delete m; diff --git a/db/column_family.h b/db/column_family.h index 0aa97699a..999433add 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -16,7 +16,7 @@ #include "rocksdb/options.h" #include "rocksdb/env.h" -#include "db/memtablelist.h" +#include "db/memtable_list.h" #include "db/write_batch_internal.h" #include "db/table_cache.h" @@ -40,7 +40,7 @@ struct SuperVersion { // We need to_delete because during Cleanup(), imm->Unref() returns // all memtables that we need to free through this vector. 
We then // delete all those memtables outside of mutex, during destruction - std::vector to_delete; + autovector to_delete; // should be called outside the mutex SuperVersion(); diff --git a/db/db_bench.cc b/db/db_bench.cc index 8355a3f0c..bdf842375 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -24,6 +24,7 @@ #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "rocksdb/statistics.h" +#include "rocksdb/perf_context.h" #include "port/port.h" #include "util/bit_set.h" #include "util/crc32c.h" @@ -389,6 +390,8 @@ DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when " DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when" " this is greater than 0."); +DEFINE_int32(perf_level, 0, "Level of perf collection"); + static bool ValidateRateLimit(const char* flagname, double value) { static constexpr double EPSILON = 1e-10; if ( value < -EPSILON ) { @@ -728,6 +731,7 @@ struct SharedState { port::Mutex mu; port::CondVar cv; int total; + int perf_level; // Each thread goes through the following states: // (1) initializing @@ -739,7 +743,7 @@ struct SharedState { long num_done; bool start; - SharedState() : cv(&mu) { } + SharedState() : cv(&mu), perf_level(FLAGS_perf_level) { } }; // Per-thread state for concurrent executions of the same benchmark. 
@@ -847,6 +851,7 @@ class Benchmark { fprintf(stdout, "Memtablerep: vector\n"); break; } + fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level); PrintWarnings(); fprintf(stdout, "------------------------------------------------\n"); @@ -1202,6 +1207,7 @@ class Benchmark { } } + SetPerfLevel(static_cast (shared->perf_level)); thread->stats.Start(thread->tid); (arg->bm->*(arg->method))(thread); thread->stats.Stop(); diff --git a/db/db_impl.cc b/db/db_impl.cc index 91e327a8b..5e00b42a2 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -22,13 +22,13 @@ #include #include "db/builder.h" -#include "db/dbformat.h" #include "db/db_iter.h" +#include "db/dbformat.h" #include "db/filename.h" #include "db/log_reader.h" #include "db/log_writer.h" #include "db/memtable.h" -#include "db/memtablelist.h" +#include "db/memtable_list.h" #include "db/merge_context.h" #include "db/merge_helper.h" #include "db/prefix_filter_iterator.h" @@ -48,12 +48,13 @@ #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "port/port.h" #include "table/block.h" #include "table/block_based_table_factory.h" #include "table/merger.h" +#include "table/table_builder.h" #include "table/two_level_iterator.h" #include "util/auto_roll_logger.h" +#include "util/autovector.h" #include "util/build_version.h" #include "util/coding.h" #include "util/hash_skiplist_rep.h" @@ -61,13 +62,12 @@ #include "util/mutexlock.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" -#include "util/autovector.h" namespace rocksdb { const std::string default_column_family_name("default"); -void dumpLeveldbBuildVersion(Logger * log); +void DumpLeveldbBuildVersion(Logger * log); // Information kept for every waiting writer struct DBImpl::Writer { @@ -141,7 +141,10 @@ Options SanitizeOptions(const std::string& dbname, DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { DBOptions result = src; - ClipToRange(&result.max_open_files, 20, 1000000); + // 
result.max_open_files means an "infinite" open files. + if (result.max_open_files != -1) { + ClipToRange(&result.max_open_files, 20, 1000000); + } if (result.max_background_flushes == 0) { result.max_background_flushes = 1; } @@ -210,10 +213,6 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) : env_(options.env), dbname_(dbname), options_(SanitizeOptions(dbname, options)), - // Reserve ten files or so for other uses and give the rest to TableCache. - table_cache_(NewLRUCache(options_.max_open_files - 10, - options_.table_cache_numshardbits, - options_.table_cache_remove_scan_count_limit)), db_lock_(nullptr), mutex_(options.use_adaptive_mutex), shutting_down_(nullptr), @@ -239,18 +238,27 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) env_->GetAbsolutePath(dbname, &db_absolute_path_); + // Reserve ten files or so for other uses and give the rest to TableCache. + // Give a large number for setting of "infinite" open files. + const int table_cache_size = + (options_.max_open_files == -1) ? 4194304 : options_.max_open_files - 10; + // Reserve ten files or so for other uses and give the rest to TableCache. 
+ table_cache_ = + NewLRUCache(table_cache_size, options_.table_cache_numshardbits, + options_.table_cache_remove_scan_count_limit); + versions_.reset( new VersionSet(dbname_, &options_, storage_options_, table_cache_.get())); column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); - dumpLeveldbBuildVersion(options_.info_log.get()); + DumpLeveldbBuildVersion(options_.info_log.get()); // TODO(icanadi) dump DBOptions and ColumnFamilyOptions separately // options_.Dump(options_.info_log.get()); char name[100]; - Status st = env_->GetHostName(name, 100L); - if (st.ok()) { + Status s = env_->GetHostName(name, 100L); + if (s.ok()) { host_name_ = name; } else { Log(options_.info_log, "Can't get hostname, use localhost as host name."); @@ -283,6 +291,10 @@ DBImpl::~DBImpl() { env_->UnlockFile(db_lock_); } + // versions need to be destroyed before table_cache since it can hold + // references to table_cache. + versions_.reset(); + LogFlush(options_.info_log); } @@ -396,7 +408,7 @@ void DBImpl::MaybeDumpStats() { } // Returns the list of live files in 'sst_live' and the list -// of all files in the filesystem in 'all_files'. +// of all files in the filesystem in 'candidate_files'. // no_full_scan = true -- never do the full scan using GetChildren() // force = false -- don't force the full scan, except every // options_.delete_obsolete_files_period_micros @@ -448,15 +460,18 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, versions_->AddLiveFiles(&deletion_state.sst_live); if (doing_the_full_scan) { - // set of all files in the directory - env_->GetChildren(dbname_, &deletion_state.all_files); // Ignore errors + // set of all files in the directory. We'll exclude files that are still + // alive in the subsequent processings. 
+ env_->GetChildren( + dbname_, &deletion_state.candidate_files + ); // Ignore errors //Add log files in wal_dir if (options_.wal_dir != dbname_) { std::vector log_files; env_->GetChildren(options_.wal_dir, &log_files); // Ignore errors - deletion_state.all_files.insert( - deletion_state.all_files.end(), + deletion_state.candidate_files.insert( + deletion_state.candidate_files.end(), log_files.begin(), log_files.end() ); @@ -469,11 +484,10 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, // files in sst_delete_files and log_delete_files. // It is not necessary to hold the mutex when invoking this method. void DBImpl::PurgeObsoleteFiles(DeletionState& state) { - // check if there is anything to do - if (!state.all_files.size() && - !state.sst_delete_files.size() && - !state.log_delete_files.size()) { + if (state.candidate_files.empty() && + state.sst_delete_files.empty() && + state.log_delete_files.empty()) { return; } @@ -483,100 +497,114 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { if (state.manifest_file_number == 0) { return; } - - uint64_t number; - FileType type; std::vector old_log_files; // Now, convert live list to an unordered set, WITHOUT mutex held; // set is slow. - std::unordered_set live_set(state.sst_live.begin(), - state.sst_live.end()); - - state.all_files.reserve(state.all_files.size() + - state.sst_delete_files.size()); + std::unordered_set sst_live( + state.sst_live.begin(), state.sst_live.end() + ); + + auto& candidate_files = state.candidate_files; + candidate_files.reserve( + candidate_files.size() + + state.sst_delete_files.size() + + state.log_delete_files.size()); + // We may ignore the dbname when generating the file names. 
+ const char* kDumbDbName = ""; for (auto file : state.sst_delete_files) { - state.all_files.push_back(TableFileName("", file->number).substr(1)); + candidate_files.push_back( + TableFileName(kDumbDbName, file->number).substr(1) + ); delete file; } - state.all_files.reserve(state.all_files.size() + - state.log_delete_files.size()); - for (auto filenum : state.log_delete_files) { - if (filenum > 0) { - state.all_files.push_back(LogFileName("", filenum).substr(1)); + for (auto file_num : state.log_delete_files) { + if (file_num > 0) { + candidate_files.push_back( + LogFileName(kDumbDbName, file_num).substr(1) + ); } } - // dedup state.all_files so we don't try to delete the same + // dedup state.candidate_files so we don't try to delete the same // file twice - sort(state.all_files.begin(), state.all_files.end()); - auto unique_end = unique(state.all_files.begin(), state.all_files.end()); - - for (size_t i = 0; state.all_files.begin() + i < unique_end; i++) { - if (ParseFileName(state.all_files[i], &number, &type)) { - bool keep = true; - switch (type) { - case kLogFile: - keep = ((number >= state.log_number) || - (number == state.prev_log_number)); - break; - case kDescriptorFile: - // Keep my manifest file, and any newer incarnations' - // (in case there is a race that allows other incarnations) - keep = (number >= state.manifest_file_number); - break; - case kTableFile: - keep = (live_set.find(number) != live_set.end()); - break; - case kTempFile: - // Any temp files that are currently being written to must - // be recorded in pending_outputs_, which is inserted into "live" - keep = (live_set.find(number) != live_set.end()); - break; - case kInfoLogFile: - keep = true; - if (number != 0) { - old_log_files.push_back(state.all_files[i]); - } - break; - case kCurrentFile: - case kDBLockFile: - case kIdentityFile: - case kMetaDatabase: - keep = true; - break; - } + sort(candidate_files.begin(), candidate_files.end()); + candidate_files.erase( + 
unique(candidate_files.begin(), candidate_files.end()), + candidate_files.end() + ); + + for (const auto& to_delete : candidate_files) { + uint64_t number; + FileType type; + // Ignore file if we cannot recognize it. + if (!ParseFileName(to_delete, &number, &type)) { + continue; + } - if (!keep) { - if (type == kTableFile) { - // evict from cache - TableCache::Evict(table_cache_.get(), number); + bool keep = true; + switch (type) { + case kLogFile: + keep = ((number >= state.log_number) || + (number == state.prev_log_number)); + break; + case kDescriptorFile: + // Keep my manifest file, and any newer incarnations' + // (in case there is a race that allows other incarnations) + keep = (number >= state.manifest_file_number); + break; + case kTableFile: + keep = (sst_live.find(number) != sst_live.end()); + break; + case kTempFile: + // Any temp files that are currently being written to must + // be recorded in pending_outputs_, which is inserted into "live" + keep = (sst_live.find(number) != sst_live.end()); + break; + case kInfoLogFile: + keep = true; + if (number != 0) { + old_log_files.push_back(to_delete); } - std::string fname = ((type == kLogFile) ? options_.wal_dir : dbname_) + - "/" + state.all_files[i]; + break; + case kCurrentFile: + case kDBLockFile: + case kIdentityFile: + case kMetaDatabase: + keep = true; + break; + } + + if (keep) { + continue; + } + + if (type == kTableFile) { + // evict from cache + TableCache::Evict(table_cache_.get(), number); + } + std::string fname = ((type == kLogFile) ? 
options_.wal_dir : dbname_) + + "/" + to_delete; + Log(options_.info_log, + "Delete type=%d #%lu", + int(type), + (unsigned long)number); + + if (type == kLogFile && + (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0)) { + Status s = env_->RenameFile(fname, + ArchivedLogFileName(options_.wal_dir, number)); + if (!s.ok()) { Log(options_.info_log, - "Delete type=%d #%lu", - int(type), - (unsigned long)number); - - Status st; - if (type == kLogFile && (options_.WAL_ttl_seconds > 0 || - options_.WAL_size_limit_MB > 0)) { - st = env_->RenameFile(fname, - ArchivedLogFileName(options_.wal_dir, number)); - if (!st.ok()) { - Log(options_.info_log, - "RenameFile logfile #%lu FAILED -- %s\n", - (unsigned long)number, st.ToString().c_str()); - } - } else { - st = env_->DeleteFile(fname); - if (!st.ok()) { - Log(options_.info_log, "Delete type=%d #%lu FAILED -- %s\n", - int(type), (unsigned long)number, st.ToString().c_str()); - } - } + "RenameFile logfile #%lu FAILED -- %s\n", + (unsigned long)number, s.ToString().c_str()); + } + } else { + Status s = env_->DeleteFile(fname); + if (!s.ok()) { + Log(options_.info_log, "Delete type=%d #%lu FAILED -- %s\n", + int(type), (unsigned long)number, s.ToString().c_str()); } } } @@ -805,10 +833,11 @@ Status DBImpl::Recover( if (!s.ok()) { return s; } - uint64_t number; - FileType type; + std::vector logs; for (size_t i = 0; i < filenames.size(); i++) { + uint64_t number; + FileType type; if (ParseFileName(filenames[i], &number, &type) && type == kLogFile && ((number >= min_log) || (number == prev_log))) { @@ -824,12 +853,12 @@ Status DBImpl::Recover( // Recover in the order in which the logs were generated std::sort(logs.begin(), logs.end()); - for (size_t i = 0; s.ok() && i < logs.size(); i++) { + for (const auto& log : logs) { // The previous incarnation may not have written any MANIFEST // records after allocating this log number. So we manually // update the file number allocation counter in VersionSet. 
- versions_->MarkFileNumberUsed(logs[i]); - s = RecoverLogFile(logs[i], &max_sequence, read_only); + versions_->MarkFileNumberUsed(log); + s = RecoverLogFile(log, &max_sequence, read_only); } if (s.ok()) { @@ -1011,7 +1040,7 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, { mutex_.Unlock(); s = BuildTable(dbname_, env_, *cfd->full_options(), storage_options_, - cfd->table_cache(), iter, &meta, cfd->user_comparator(), + cfd->table_cache(), iter, &meta, cfd->internal_comparator(), newest_snapshot, earliest_seqno_in_memtable, GetCompressionFlush(*cfd->full_options())); LogFlush(options_.info_log); @@ -1045,7 +1074,7 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, } Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, - std::vector& mems, VersionEdit* edit, + autovector& mems, VersionEdit* edit, uint64_t* filenumber) { mutex_.AssertHeld(); const uint64_t start_micros = env_->NowMicros(); @@ -1062,21 +1091,20 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, Status s; { mutex_.Unlock(); - std::vector list; + std::vector memtables; for (MemTable* m : mems) { Log(options_.info_log, "Flushing memtable with log file: %lu\n", (unsigned long)m->GetLogNumber()); - list.push_back(m->NewIterator()); + memtables.push_back(m->NewIterator()); } - Iterator* iter = - NewMergingIterator(&cfd->internal_comparator(), &list[0], list.size()); - Log(options_.info_log, - "Level-0 flush table #%lu: started", + Iterator* iter = NewMergingIterator(env_, &cfd->internal_comparator(), + &memtables[0], memtables.size()); + Log(options_.info_log, "Level-0 flush table #%lu: started", (unsigned long)meta.number); s = BuildTable(dbname_, env_, *cfd->full_options(), storage_options_, - cfd->table_cache(), iter, &meta, cfd->user_comparator(), + cfd->table_cache(), iter, &meta, cfd->internal_comparator(), newest_snapshot, earliest_seqno_in_memtable, GetCompressionFlush(*cfd->full_options())); LogFlush(options_.info_log); @@ 
-1092,7 +1120,6 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, } base->Unref(); - // re-acquire the most current version base = cfd->current(); @@ -1145,7 +1172,7 @@ Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd, // Save the contents of the earliest memtable as a new Table uint64_t file_number; - std::vector mems; + autovector mems; cfd->imm()->PickMemtablesToFlush(&mems); if (mems.empty()) { Log(options_.info_log, "Nothing in memstore to flush"); @@ -1763,8 +1790,7 @@ Status DBImpl::BackgroundFlush(bool* madeProgress, void DBImpl::BackgroundCallFlush() { bool madeProgress = false; - DeletionState deletion_state(default_cfd_->options()->max_write_buffer_number, - true); + DeletionState deletion_state(true); assert(bg_flush_scheduled_); MutexLock l(&mutex_); @@ -1815,8 +1841,7 @@ uint64_t DBImpl::TEST_GetLevel0TotalSize() { void DBImpl::BackgroundCallCompaction() { bool madeProgress = false; - DeletionState deletion_state(default_cfd_->options()->max_write_buffer_number, - true); + DeletionState deletion_state(true); MaybeDumpStats(); @@ -2077,8 +2102,9 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { *cfd->full_options(), compact->compaction->output_level(), compact->compaction->enable_compression()); - compact->builder.reset(GetTableBuilder( - *cfd->full_options(), compact->outfile.get(), compression_type)); + compact->builder.reset( + NewTableBuilder(*cfd->full_options(), cfd->internal_comparator(), + compact->outfile.get(), compression_type)); } LogFlush(options_.info_log); return s; @@ -2126,8 +2152,9 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, if (s.ok() && current_entries > 0) { // Verify that the table is usable ColumnFamilyData* cfd = compact->compaction->column_family_data(); + FileMetaData meta(output_number, current_bytes); Iterator* iter = cfd->table_cache()->NewIterator( - ReadOptions(), storage_options_, output_number, current_bytes); + ReadOptions(), storage_options_, 
cfd->internal_comparator(), meta); s = iter->status(); delete iter; if (s.ok()) { @@ -2641,8 +2668,9 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, // Collect iterators for files in L0 - Ln super_version->current->AddIterators(options, storage_options_, &iterator_list); - Iterator* internal_iter = NewMergingIterator( - &cfd->internal_comparator(), &iterator_list[0], iterator_list.size()); + Iterator* internal_iter = + NewMergingIterator(env_, &cfd->internal_comparator(), &iterator_list[0], + iterator_list.size()); IterState* cleanup = new IterState(this, &mutex_, super_version); internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); @@ -2677,8 +2705,8 @@ std::pair DBImpl::GetTailingIteratorPair( std::vector list; super_version->imm->AddIterators(options, &list); super_version->current->AddIterators(options, storage_options_, &list); - Iterator* immutable_iter = - NewMergingIterator(&cfd->internal_comparator(), &list[0], list.size()); + Iterator* immutable_iter = NewMergingIterator( + env_, &cfd->internal_comparator(), &list[0], list.size()); // create a DBIter that only uses memtable content; see NewIterator() immutable_iter = @@ -2739,6 +2767,8 @@ Status DBImpl::GetImpl(const ReadOptions& options, const Slice& key, std::string* value, bool* value_found) { StopWatch sw(env_, options_.statistics.get(), DB_GET, false); + StopWatchNano snapshot_timer(env_, false); + StartPerfTimer(&snapshot_timer); mutex_.Lock(); auto cfd = versions_->GetColumnFamilySet()->GetColumnFamily(column_family.id); @@ -2766,6 +2796,7 @@ Status DBImpl::GetImpl(const ReadOptions& options, // s is both in/out. When in, s could either be OK or MergeInProgress. // merge_operands will contain the sequence of merges in the latter case. 
LookupKey lkey(key, snapshot); + BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer); if (get_version->mem->Get(lkey, value, &s, merge_context, *cfd->full_options())) { // Done @@ -2775,12 +2806,19 @@ Status DBImpl::GetImpl(const ReadOptions& options, // Done RecordTick(options_.statistics.get(), MEMTABLE_HIT); } else { + StopWatchNano from_files_timer(env_, false); + StartPerfTimer(&from_files_timer); + get_version->current->Get(options, lkey, value, &s, &merge_context, &stats, *cfd->full_options(), value_found); have_stat_update = true; + BumpPerfTime(&perf_context.get_from_output_files_time, &from_files_timer); RecordTick(options_.statistics.get(), MEMTABLE_MISS); } + StopWatchNano post_process_timer(env_, false); + StartPerfTimer(&post_process_timer); + bool delete_get_version = false; if (!cfd->options()->disable_seek_compaction && have_stat_update) { mutex_.Lock(); @@ -2805,8 +2843,10 @@ Status DBImpl::GetImpl(const ReadOptions& options, } // Note, tickers are atomic now - no lock protection needed any more. + RecordTick(options_.statistics.get(), NUMBER_KEYS_READ); RecordTick(options_.statistics.get(), BYTES_READ, value->size()); + BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer); return s; } @@ -2816,6 +2856,9 @@ std::vector DBImpl::MultiGet( const std::vector& keys, std::vector* values) { StopWatch sw(env_, options_.statistics.get(), DB_MULTIGET, false); + StopWatchNano snapshot_timer(env_, false); + StartPerfTimer(&snapshot_timer); + SequenceNumber snapshot; struct MultiGetColumnFamilyData { @@ -2856,6 +2899,7 @@ std::vector DBImpl::MultiGet( // Keep track of bytes that we read for statistics-recording later uint64_t bytes_read = 0; + BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer); // For each of the given keys, apply the entire "get" process as follows: // First look in the memtable, then in the immutable memtable (if any). 
@@ -2889,6 +2933,9 @@ std::vector DBImpl::MultiGet( } } + // Post processing (decrement reference counts and record statistics) + StopWatchNano post_process_timer(env_, false); + StartPerfTimer(&post_process_timer); autovector superversions_to_delete; bool schedule_flush_or_compaction = false; @@ -2921,6 +2968,7 @@ std::vector DBImpl::MultiGet( RecordTick(options_.statistics.get(), NUMBER_MULTIGET_CALLS); RecordTick(options_.statistics.get(), NUMBER_MULTIGET_KEYS_READ, num_keys); RecordTick(options_.statistics.get(), NUMBER_MULTIGET_BYTES_READ, bytes_read); + BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer); return stat_list; } @@ -3080,6 +3128,8 @@ Status DBImpl::Delete(const WriteOptions& options, } Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { + StopWatchNano pre_post_process_timer(env_, false); + StartPerfTimer(&pre_post_process_timer); Writer w(&mutex_); w.batch = my_batch; w.sync = options.sync; @@ -3148,6 +3198,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { if (options.disableWAL) { flush_on_destroy_ = true; } + BumpPerfTime(&perf_context.write_pre_and_post_process_time, + &pre_post_process_timer); if (!options.disableWAL) { StopWatchNano timer(env_); @@ -3156,7 +3208,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { status = log_->AddRecord(log_entry); RecordTick(options_.statistics.get(), WAL_FILE_SYNCED, 1); RecordTick(options_.statistics.get(), WAL_FILE_BYTES, log_entry.size()); - BumpPerfTime(&perf_context.wal_write_time, &timer); if (status.ok() && options.sync) { if (options_.use_fsync) { StopWatch(env_, options_.statistics.get(), WAL_FILE_SYNC_MICROS); @@ -3166,12 +3217,17 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { status = log_->file()->Sync(); } } + BumpPerfTime(&perf_context.write_wal_time, &timer); } if (status.ok()) { + StopWatchNano write_memtable_timer(env_, false); + // reading the column family set 
outside of DB mutex -- should lock versions_->GetColumnFamilySet()->Lock(); + StartPerfTimer(&write_memtable_timer); status = WriteBatchInternal::InsertInto( updates, column_family_memtables_.get(), 0, this, false); + BumpPerfTime(&perf_context.write_memtable_time, &write_memtable_timer); versions_->GetColumnFamilySet()->Unlock(); if (!status.ok()) { @@ -3184,6 +3240,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER, last_sequence); } + StartPerfTimer(&pre_post_process_timer); if (updates == &tmp_batch_) tmp_batch_.Clear(); mutex_.Lock(); if (status.ok()) { @@ -3211,6 +3268,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { writers_.front()->cv.Signal(); } mutex_.Unlock(); + BumpPerfTime(&perf_context.write_pre_and_post_process_time, + &pre_post_process_timer); return status; } @@ -3420,7 +3479,7 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) { } else { unique_ptr lfile; - MemTable* memtmp = nullptr; + MemTable* new_mem = nullptr; // Attempt to switch to a new memtable and trigger compaction of old. // Do this without holding the dbmutex lock. @@ -3439,7 +3498,7 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) { // (compression, etc) but err on the side of caution. lfile->SetPreallocationBlockSize(1.1 * cfd->options()->write_buffer_size); - memtmp = new MemTable(cfd->internal_comparator(), *cfd->options()); + new_mem = new MemTable(cfd->internal_comparator(), *cfd->options()); new_superversion = new SuperVersion(); } } @@ -3447,7 +3506,7 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) { if (!s.ok()) { // Avoid chewing through file number space in a tight loop. 
versions_->ReuseFileNumber(new_log_number); - assert (!memtmp); + assert (!new_mem); break; } logfile_number_ = new_log_number; @@ -3457,12 +3516,12 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) { if (force) { cfd->imm()->FlushRequested(); } - memtmp->Ref(); - memtmp->SetLogNumber(logfile_number_); - cfd->SetMemtable(memtmp); + new_mem->Ref(); + new_mem->SetLogNumber(logfile_number_); + cfd->SetMemtable(new_mem); Log(options_.info_log, "New memtable created with log file: #%lu\n", (unsigned long)logfile_number_); - force = false; // Do not force another compaction if have room + force = false; // Do not force another compaction if have room MaybeScheduleFlushOrCompaction(); delete cfd->InstallSuperVersion(new_superversion); } @@ -3552,10 +3611,10 @@ Status DBImpl::DeleteFile(std::string name) { } int level; - FileMetaData metadata; + FileMetaData *metadata; ColumnFamilyData* cfd; VersionEdit edit; - DeletionState deletion_state(0, true); + DeletionState deletion_state(true); { MutexLock l(&mutex_); status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd); @@ -3567,7 +3626,7 @@ Status DBImpl::DeleteFile(std::string name) { assert((level > 0) && (level < cfd->NumberLevels())); // If the file is being compacted no need to delete. - if (metadata.being_compacted) { + if (metadata->being_compacted) { Log(options_.info_log, "DeleteFile %s Skipped. 
File about to be compacted\n", name.c_str()); return Status::OK(); @@ -3866,7 +3925,7 @@ Status DestroyDB(const std::string& dbname, const Options& options) { // // A global method that can dump out the build version -void dumpLeveldbBuildVersion(Logger * log) { +void DumpLeveldbBuildVersion(Logger * log) { Log(log, "Git sha %s", rocksdb_build_git_sha); Log(log, "Compile time %s %s", rocksdb_build_compile_time, rocksdb_build_compile_date); diff --git a/db/db_impl.h b/db/db_impl.h index 1d117599c..bb32ea046 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -7,24 +7,26 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once + #include #include #include #include #include + #include "db/dbformat.h" #include "db/log_writer.h" #include "db/snapshot.h" #include "db/column_family.h" #include "db/version_edit.h" +#include "memtable_list.h" +#include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "rocksdb/transaction_log.h" -#include "port/port.h" -#include "util/stats_logger.h" -#include "memtablelist.h" #include "util/autovector.h" +#include "util/stats_logger.h" #include "db/internal_stats.h" namespace rocksdb { @@ -178,7 +180,7 @@ class DBImpl : public DB { // needed for CleanupIteratorState struct DeletionState { inline bool HaveSomethingToDelete() const { - return all_files.size() || + return candidate_files.size() || sst_delete_files.size() || log_delete_files.size(); } @@ -186,7 +188,7 @@ class DBImpl : public DB { // a list of all files that we'll consider deleting // (every once in a while this is filled up with all files // in the DB directory) - std::vector all_files; + std::vector candidate_files; // the list of all live sst files that cannot be deleted std::vector sst_live; @@ -198,7 +200,7 @@ class DBImpl : public DB { std::vector log_delete_files; // a list of memtables to be free - std::vector 
memtables_to_free; + autovector memtables_to_free; SuperVersion* superversion_to_free; // if nullptr nothing to free @@ -208,12 +210,10 @@ class DBImpl : public DB { // that corresponds to the set of files in 'live'. uint64_t manifest_file_number, log_number, prev_log_number; - explicit DeletionState(const int num_memtables = 0, - bool create_superversion = false) { + explicit DeletionState(bool create_superversion = false) { manifest_file_number = 0; log_number = 0; prev_log_number = 0; - memtables_to_free.reserve(num_memtables); superversion_to_free = nullptr; new_superversion = create_superversion ? new SuperVersion() : nullptr; } @@ -232,7 +232,7 @@ class DBImpl : public DB { }; // Returns the list of live files in 'live' and the list - // of all files in the filesystem in 'all_files'. + // of all files in the filesystem in 'candidate_files'. // If force == false and the last call was less than // options_.delete_obsolete_files_period_micros microseconds ago, // it will not fill up the deletion_state @@ -291,7 +291,7 @@ class DBImpl : public DB { // concurrent flush memtables to storage. 
Status WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, VersionEdit* edit); - Status WriteLevel0Table(ColumnFamilyData* cfd, std::vector& mems, + Status WriteLevel0Table(ColumnFamilyData* cfd, autovector& mems, VersionEdit* edit, uint64_t* filenumber); uint64_t SlowdownAmount(int n, double bottom, double top); diff --git a/db/db_iter.cc b/db/db_iter.cc index 71bb2e57c..b8d9038a1 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -102,7 +102,8 @@ class DBIter: public Iterator { virtual void SeekToLast(); private: - void FindNextUserEntry(bool skipping); + inline void FindNextUserEntry(bool skipping); + void FindNextUserEntryInternal(bool skipping); void FindPrevUserEntry(); bool ParseKey(ParsedInternalKey* key); void MergeValuesNewToOld(); @@ -191,7 +192,15 @@ void DBIter::Next() { // // NOTE: In between, saved_key_ can point to a user key that has // a delete marker -void DBIter::FindNextUserEntry(bool skipping) { +inline void DBIter::FindNextUserEntry(bool skipping) { + StopWatchNano timer(env_, false); + StartPerfTimer(&timer); + FindNextUserEntryInternal(skipping); + BumpPerfTime(&perf_context.find_next_user_entry_time, &timer); +} + +// Actual implementation of DBIter::FindNextUserEntry() +void DBIter::FindNextUserEntryInternal(bool skipping) { // Loop until we hit an acceptable entry to yield assert(iter_->Valid()); assert(direction_ == kForward); @@ -226,10 +235,7 @@ void DBIter::FindNextUserEntry(bool skipping) { valid_ = true; MergeValuesNewToOld(); // Go to a different state machine return; - case kTypeColumnFamilyDeletion: - case kTypeColumnFamilyValue: - case kTypeColumnFamilyMerge: - case kTypeLogData: + default: assert(false); break; } @@ -429,13 +435,16 @@ void DBIter::FindPrevUserEntry() { } void DBIter::Seek(const Slice& target) { - direction_ = kForward; - ClearSavedValue(); saved_key_.clear(); AppendInternalKey( &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek)); + StopWatchNano internal_seek_timer(env_, false); 
+ StartPerfTimer(&internal_seek_timer); iter_->Seek(saved_key_); + BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); if (iter_->Valid()) { + direction_ = kForward; + ClearSavedValue(); FindNextUserEntry(false /*not skipping */); } else { valid_ = false; @@ -445,7 +454,10 @@ void DBIter::Seek(const Slice& target) { void DBIter::SeekToFirst() { direction_ = kForward; ClearSavedValue(); + StopWatchNano internal_seek_timer(env_, false); + StartPerfTimer(&internal_seek_timer); iter_->SeekToFirst(); + BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); if (iter_->Valid()) { FindNextUserEntry(false /* not skipping */); } else { @@ -464,7 +476,10 @@ void DBIter::SeekToLast() { direction_ = kReverse; ClearSavedValue(); + StopWatchNano internal_seek_timer(env_, false); + StartPerfTimer(&internal_seek_timer); iter_->SeekToLast(); + BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); FindPrevUserEntry(); } diff --git a/db/db_test.cc b/db/db_test.cc index 65b1dffd3..1edb14799 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -11,25 +11,29 @@ #include #include -#include "rocksdb/db.h" -#include "rocksdb/filter_policy.h" +#include "db/dbformat.h" #include "db/db_impl.h" #include "db/filename.h" #include "db/version_set.h" #include "db/write_batch_internal.h" -#include "table/block_based_table_factory.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" #include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/perf_context.h" +#include "table/plain_table_factory.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" +#include "table/block_based_table_factory.h" #include "util/hash.h" +#include "util/hash_linklist_rep.h" #include "util/logging.h" #include "util/mutexlock.h" +#include "util/statistics.h" #include "util/testharness.h" #include "util/testutil.h" -#include "util/statistics.h" #include 
"utilities/merge_operators.h" namespace rocksdb { @@ -241,12 +245,17 @@ class SpecialEnv : public EnvWrapper { class DBTest { private: const FilterPolicy* filter_policy_; + static std::unique_ptr prefix_1_transform; + static std::unique_ptr noop_transform; protected: // Sequence of option configurations to try enum OptionConfig { kDefault, + kPlainTableFirstBytePrefix, + kPlainTableAllBytesPrefix, kVectorRep, + kHashLinkList, kMergePut, kFilter, kUncompressed, @@ -260,6 +269,7 @@ class DBTest { kHashSkipList, kUniversalCompaction, kCompressedBlockCache, + kInfiniteMaxOpenFiles, kEnd }; int option_config_; @@ -277,7 +287,8 @@ class DBTest { kNoSkip = 0, kSkipDeletesFilterFirst = 1, kSkipUniversalCompaction = 2, - kSkipMergePut = 4 + kSkipMergePut = 4, + kSkipPlainTable = 8 }; DBTest() : option_config_(kDefault), @@ -299,20 +310,27 @@ class DBTest { // Switch to a fresh database with the next option configuration to // test. Return false if there are no more configurations to test. bool ChangeOptions(int skip_mask = kNoSkip) { - option_config_++; - // skip some options - if (skip_mask & kSkipDeletesFilterFirst && - option_config_ == kDeletesFilterFirst) { - option_config_++; - } - if (skip_mask & kSkipUniversalCompaction && - option_config_ == kUniversalCompaction) { - option_config_++; - } - if (skip_mask & kSkipMergePut && option_config_ == kMergePut) { - option_config_++; + for(option_config_++; option_config_ < kEnd; option_config_++) { + if ((skip_mask & kSkipDeletesFilterFirst) && + option_config_ == kDeletesFilterFirst) { + continue; + } + if ((skip_mask & kSkipUniversalCompaction) && + option_config_ == kUniversalCompaction) { + continue; + } + if ((skip_mask & kSkipMergePut) && option_config_ == kMergePut) { + continue; + } + if ((skip_mask & kSkipPlainTable) + && (option_config_ == kPlainTableAllBytesPrefix + || option_config_ == kPlainTableFirstBytePrefix)) { + continue; + } + break; } + if (option_config_ >= kEnd) { Destroy(&last_options_); return false; 
@@ -345,6 +363,18 @@ class DBTest { options.memtable_factory.reset( NewHashSkipListRepFactory(NewFixedPrefixTransform(1))); break; + case kPlainTableFirstBytePrefix: + options.table_factory.reset(new PlainTableFactory()); + options.prefix_extractor = prefix_1_transform.get(); + options.allow_mmap_reads = true; + options.max_sequential_skip_in_iterations = 999999; + break; + case kPlainTableAllBytesPrefix: + options.table_factory.reset(new PlainTableFactory()); + options.prefix_extractor = noop_transform.get(); + options.allow_mmap_reads = true; + options.max_sequential_skip_in_iterations = 999999; + break; case kMergePut: options.merge_operator = MergeOperators::CreatePutOperator(); break; @@ -380,12 +410,19 @@ class DBTest { case kVectorRep: options.memtable_factory.reset(new VectorRepFactory(100)); break; + case kHashLinkList: + options.memtable_factory.reset( + NewHashLinkListRepFactory(NewFixedPrefixTransform(1), 4)); + break; case kUniversalCompaction: options.compaction_style = kCompactionStyleUniversal; break; case kCompressedBlockCache: options.block_cache_compressed = NewLRUCache(8*1024*1024); break; + case kInfiniteMaxOpenFiles: + options.max_open_files = -1; + break; default: break; } @@ -526,10 +563,7 @@ class DBTest { case kTypeDeletion: result += "DEL"; break; - case kTypeColumnFamilyDeletion: - case kTypeColumnFamilyValue: - case kTypeColumnFamilyMerge: - case kTypeLogData: + default: assert(false); break; } @@ -680,6 +714,72 @@ class DBTest { delete iter; } + // Used to test InplaceUpdate + + // If previous value is nullptr or delta is > than previous value, + // sets newValue with delta + // If previous value is not empty, + // updates previous value with 'b' string of previous value size - 1. 
+ static UpdateStatus + updateInPlaceSmallerSize(char* prevValue, uint32_t* prevSize, + Slice delta, std::string* newValue) { + if (prevValue == nullptr) { + *newValue = std::string(delta.size(), 'c'); + return UpdateStatus::UPDATED; + } else { + *prevSize = *prevSize - 1; + std::string str_b = std::string(*prevSize, 'b'); + memcpy(prevValue, str_b.c_str(), str_b.size()); + return UpdateStatus::UPDATED_INPLACE; + } + } + + static UpdateStatus + updateInPlaceSmallerVarintSize(char* prevValue, uint32_t* prevSize, + Slice delta, std::string* newValue) { + if (prevValue == nullptr) { + *newValue = std::string(delta.size(), 'c'); + return UpdateStatus::UPDATED; + } else { + *prevSize = 1; + std::string str_b = std::string(*prevSize, 'b'); + memcpy(prevValue, str_b.c_str(), str_b.size()); + return UpdateStatus::UPDATED_INPLACE; + } + } + + static UpdateStatus + updateInPlaceLargerSize(char* prevValue, uint32_t* prevSize, + Slice delta, std::string* newValue) { + *newValue = std::string(delta.size(), 'c'); + return UpdateStatus::UPDATED; + } + + static UpdateStatus + updateInPlaceNoAction(char* prevValue, uint32_t* prevSize, + Slice delta, std::string* newValue) { + return UpdateStatus::UPDATE_FAILED; + } + + // Utility method to test InplaceUpdate + void validateNumberOfEntries(int numValues) { + Iterator* iter = dbfull()->TEST_NewInternalIterator(); + iter->SeekToFirst(); + ASSERT_EQ(iter->status().ok(), true); + int seq = numValues; + while (iter->Valid()) { + ParsedInternalKey ikey; + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + + // checks sequence number for updates + ASSERT_EQ(ikey.sequence, (unsigned)seq--); + iter->Next(); + } + delete iter; + ASSERT_EQ(0, seq); + } + void CopyFile(const std::string& source, const std::string& destination, uint64_t size = 0) { const EnvOptions soptions; @@ -705,6 +805,10 @@ class DBTest { } }; +std::unique_ptr DBTest::prefix_1_transform( + NewFixedPrefixTransform(1)); +std::unique_ptr 
DBTest::noop_transform( + NewNoopTransform()); static std::string Key(int i) { char buf[100]; @@ -718,19 +822,19 @@ static long TestGetTickerCount(const Options& options, Tickers ticker_type) { TEST(DBTest, Empty) { do { - ASSERT_TRUE(db_ != nullptr); - ASSERT_EQ("NOT_FOUND", Get("foo")); - } while (ChangeOptions()); -} + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + Reopen(&options); -TEST(DBTest, ReadWrite) { - do { ASSERT_OK(Put("foo", "v1")); ASSERT_EQ("v1", Get("foo")); - ASSERT_OK(Put("bar", "v2")); - ASSERT_OK(Put("foo", "v3")); - ASSERT_EQ("v3", Get("foo")); - ASSERT_EQ("v2", Get("bar")); + + env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls + Put("k1", std::string(100000, 'x')); // Fill memtable + Put("k2", std::string(100000, 'y')); // Trigger compaction + ASSERT_EQ("v1", Get("foo")); + env_->delay_sstable_sync_.Release_Store(nullptr); // Release sync calls } while (ChangeOptions()); } @@ -769,7 +873,7 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); // Create a new talbe. - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); // index/filter blocks added to block cache right after table creation. ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); @@ -1051,7 +1155,10 @@ TEST(DBTest, KeyMayExist) { ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); delete options.filter_policy; - } while (ChangeOptions()); + + // KeyMayExist function only checks data in block caches, which is not used + // by plain table format. 
+ } while (ChangeOptions(kSkipPlainTable)); } TEST(DBTest, NonBlockingIteration) { @@ -1111,7 +1218,9 @@ TEST(DBTest, NonBlockingIteration) { ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); delete iter; - } while (ChangeOptions()); + // This test verifies block cache behaviors, which is not used by plain + // table format. + } while (ChangeOptions(kSkipPlainTable)); } // A delete is skipped for key if KeyMayExist(key) returns False @@ -1250,7 +1359,13 @@ TEST(DBTest, IterMulti) { ASSERT_EQ(IterStatus(iter), "a->va"); iter->Seek("ax"); ASSERT_EQ(IterStatus(iter), "b->vb"); + + SetPerfLevel(kEnableTime); + perf_context.Reset(); iter->Seek("b"); + ASSERT_TRUE((int) perf_context.seek_internal_seek_time > 0); + ASSERT_TRUE((int) perf_context.find_next_user_entry_time > 0); + SetPerfLevel(kDisable); ASSERT_EQ(IterStatus(iter), "b->vb"); iter->Seek("z"); ASSERT_EQ(IterStatus(iter), "(invalid)"); @@ -1265,7 +1380,12 @@ TEST(DBTest, IterMulti) { // Switch from forward to reverse iter->SeekToFirst(); iter->Next(); + SetPerfLevel(kEnableTime); + perf_context.Reset(); iter->Next(); + ASSERT_EQ(0, (int) perf_context.seek_internal_seek_time); + ASSERT_TRUE((int) perf_context.find_next_user_entry_time > 0); + SetPerfLevel(kDisable); iter->Prev(); ASSERT_EQ(IterStatus(iter), "b->vb"); @@ -1696,22 +1816,42 @@ TEST(DBTest, NumImmutableMemTable) { std::string big_value(1000000, 'x'); std::string num; + SetPerfLevel(kEnableTime);; ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value)); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "0"); + perf_context.Reset(); + Get("k1"); + ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value)); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "1"); + perf_context.Reset(); + Get("k1"); + ASSERT_EQ(2, (int) perf_context.get_from_memtable_count); + perf_context.Reset(); + Get("k2"); + 
ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value)); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "2"); + perf_context.Reset(); + Get("k2"); + ASSERT_EQ(2, (int) perf_context.get_from_memtable_count); + perf_context.Reset(); + Get("k3"); + ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); + perf_context.Reset(); + Get("k1"); + ASSERT_EQ(3, (int) perf_context.get_from_memtable_count); dbfull()->Flush(FlushOptions()); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "0"); + SetPerfLevel(kDisable); } while (ChangeCompactOptions()); } @@ -1720,11 +1860,16 @@ TEST(DBTest, FLUSH) { Options options = CurrentOptions(); WriteOptions writeOpt = WriteOptions(); writeOpt.disableWAL = true; + SetPerfLevel(kEnableTime);; ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1")); // this will now also flush the last 2 writes dbfull()->Flush(FlushOptions()); ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v1")); + perf_context.Reset(); + Get("foo"); + ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0); + Reopen(); ASSERT_EQ("v1", Get("foo")); ASSERT_EQ("v1", Get("bar")); @@ -1736,7 +1881,9 @@ TEST(DBTest, FLUSH) { Reopen(); ASSERT_EQ("v2", Get("bar")); + perf_context.Reset(); ASSERT_EQ("v2", Get("foo")); + ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0); writeOpt.disableWAL = false; ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v3")); @@ -1748,6 +1895,8 @@ TEST(DBTest, FLUSH) { // has WAL enabled. 
ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v3", Get("bar")); + + SetPerfLevel(kDisable); } while (ChangeCompactOptions()); } @@ -2559,9 +2708,9 @@ TEST(DBTest, InPlaceUpdate) { options.inplace_update_support = true; options.env = env_; options.write_buffer_size = 100000; + Reopen(&options); // Update key with values of smaller size - Reopen(&options); int numValues = 10; for (int i = numValues; i > 0; i--) { std::string value = DummyString(i, 'a'); @@ -2569,50 +2718,133 @@ TEST(DBTest, InPlaceUpdate) { ASSERT_EQ(value, Get("key")); } - int count = 0; - Iterator* iter = dbfull()->TEST_NewInternalIterator(); - iter->SeekToFirst(); - ASSERT_EQ(iter->status().ok(), true); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - count++; - // All updates with the same sequence number. - ASSERT_EQ(ikey.sequence, (unsigned)1); - iter->Next(); - } // Only 1 instance for that key. - ASSERT_EQ(count, 1); - delete iter; + validateNumberOfEntries(1); + + } while (ChangeCompactOptions()); +} + +TEST(DBTest, InPlaceUpdateLargeNewValue) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + options.env = env_; + options.write_buffer_size = 100000; + Reopen(&options); // Update key with values of larger size - DestroyAndReopen(&options); - numValues = 10; + int numValues = 10; for (int i = 0; i < numValues; i++) { std::string value = DummyString(i, 'a'); ASSERT_OK(Put("key", value)); ASSERT_EQ(value, Get("key")); } - count = 0; - iter = dbfull()->TEST_NewInternalIterator(); - iter->SeekToFirst(); - ASSERT_EQ(iter->status().ok(), true); - int seq = numValues; - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - count++; - // No inplace updates. 
All updates are puts with new seq number - ASSERT_EQ(ikey.sequence, (unsigned)seq--); - iter->Next(); + // All 10 updates exist in the internal iterator + validateNumberOfEntries(numValues); + + } while (ChangeCompactOptions()); +} + + +TEST(DBTest, InPlaceUpdateCallbackSmallerSize) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + rocksdb::DBTest::updateInPlaceSmallerSize; + Reopen(&options); + + // Update key with values of smaller size + int numValues = 10; + ASSERT_OK(Put("key", DummyString(numValues, 'a'))); + ASSERT_EQ(DummyString(numValues, 'c'), Get("key")); + + for (int i = numValues; i > 0; i--) { + ASSERT_OK(Put("key", DummyString(i, 'a'))); + ASSERT_EQ(DummyString(i - 1, 'b'), Get("key")); + } + + // Only 1 instance for that key. + validateNumberOfEntries(1); + + } while (ChangeCompactOptions()); +} + +TEST(DBTest, InPlaceUpdateCallbackSmallerVarintSize) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + rocksdb::DBTest::updateInPlaceSmallerVarintSize; + Reopen(&options); + + // Update key with values of smaller varint size + int numValues = 265; + ASSERT_OK(Put("key", DummyString(numValues, 'a'))); + ASSERT_EQ(DummyString(numValues, 'c'), Get("key")); + + for (int i = numValues; i > 0; i--) { + ASSERT_OK(Put("key", DummyString(i, 'a'))); + ASSERT_EQ(DummyString(1, 'b'), Get("key")); + } + + // Only 1 instance for that key. 
+ validateNumberOfEntries(1); + + } while (ChangeCompactOptions()); +} + +TEST(DBTest, InPlaceUpdateCallbackLargeNewValue) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + rocksdb::DBTest::updateInPlaceLargerSize; + Reopen(&options); + + // Update key with values of larger size + int numValues = 10; + for (int i = 0; i < numValues; i++) { + ASSERT_OK(Put("key", DummyString(i, 'a'))); + ASSERT_EQ(DummyString(i, 'c'), Get("key")); } + + // No inplace updates. All updates are puts with new seq number // All 10 updates exist in the internal iterator - ASSERT_EQ(count, numValues); - delete iter; + validateNumberOfEntries(numValues); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, InPlaceUpdateCallbackNoAction) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + rocksdb::DBTest::updateInPlaceNoAction; + Reopen(&options); + + // Callback function requests no actions from db + ASSERT_OK(Put("key", DummyString(1, 'a'))); + ASSERT_EQ(Get("key"), "NOT_FOUND"); } while (ChangeCompactOptions()); } @@ -2653,9 +2885,7 @@ class DeleteFilter : public CompactionFilter { class ChangeFilter : public CompactionFilter { public: - explicit ChangeFilter(int argv) { - assert(argv == 100); - } + explicit ChangeFilter() {} virtual bool Filter(int level, const Slice& key, const Slice& value, std::string* new_value, @@ -2697,19 +2927,16 @@ class DeleteFilterFactory : public CompactionFilterFactory { class ChangeFilterFactory : public CompactionFilterFactory { public: - explicit ChangeFilterFactory(int argv) : argv_(argv) {} + explicit ChangeFilterFactory() {} virtual std::unique_ptr CreateCompactionFilter(const CompactionFilter::Context& context) override { - 
return std::unique_ptr(new ChangeFilter(argv_)); + return std::unique_ptr(new ChangeFilter()); } virtual const char* Name() const override { return "ChangeFilterFactory"; } - - private: - const int argv_; }; TEST(DBTest, CompactionFilter) { @@ -2856,7 +3083,7 @@ TEST(DBTest, CompactionFilterWithValueChange) { options.num_levels = 3; options.max_mem_compaction_level = 0; options.compaction_filter_factory = - std::make_shared(100); + std::make_shared(); Reopen(&options); // Write 100K+1 keys, these are written to a few files @@ -3000,7 +3227,8 @@ TEST(DBTest, ApproximateSizes) { ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_GT(NumTableFilesAtLevel(1), 0); } - } while (ChangeOptions(kSkipUniversalCompaction)); + // ApproximateOffsetOf() is not yet implemented in plain table format. + } while (ChangeOptions(kSkipUniversalCompaction | kSkipPlainTable)); } TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { @@ -3038,7 +3266,8 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { dbfull()->TEST_CompactRange(0, nullptr, nullptr); } - } while (ChangeOptions()); + // ApproximateOffsetOf() is not yet implemented in plain table format. + } while (ChangeOptions(kSkipPlainTable)); } TEST(DBTest, IteratorPinsRef) { @@ -3122,7 +3351,9 @@ TEST(DBTest, HiddenValuesAreRemoved) { ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000)); - } while (ChangeOptions(kSkipUniversalCompaction)); + // ApproximateOffsetOf() is not yet implemented in plain table format, + // which is used by Size(). 
+ } while (ChangeOptions(kSkipUniversalCompaction | kSkipPlainTable)); } TEST(DBTest, CompactBetweenSnapshots) { @@ -4790,7 +5021,9 @@ TEST(DBTest, Randomized) { // TODO(sanjay): Test Get() works int p = rnd.Uniform(100); int minimum = 0; - if (option_config_ == kHashSkipList) { + if (option_config_ == kHashSkipList || + option_config_ == kHashLinkList || + option_config_ == kPlainTableFirstBytePrefix) { minimum = 1; } if (p < 45) { // Put @@ -4969,20 +5202,22 @@ TEST(DBTest, PrefixScan) { snprintf(buf, sizeof(buf), "03______:"); prefix = Slice(buf, 8); key = Slice(buf, 9); - auto prefix_extractor = NewFixedPrefixTransform(8); // db configs env_->count_random_reads_ = true; Options options = CurrentOptions(); options.env = env_; options.no_block_cache = true; - options.filter_policy = NewBloomFilterPolicy(10); - options.prefix_extractor = prefix_extractor; + options.filter_policy = NewBloomFilterPolicy(10); + options.prefix_extractor = NewFixedPrefixTransform(8); options.whole_key_filtering = false; options.disable_auto_compactions = true; options.max_background_compactions = 2; options.create_if_missing = true; options.disable_seek_compaction = true; - options.memtable_factory.reset(NewHashSkipListRepFactory(prefix_extractor)); + // Tricky: options.prefix_extractor will be released by + // NewHashSkipListRepFactory after use. + options.memtable_factory.reset( + NewHashSkipListRepFactory(options.prefix_extractor)); // prefix specified, with blooms: 2 RAND I/Os // SeekToFirst diff --git a/db/dbformat.cc b/db/dbformat.cc index 3d7e61010..43560bc83 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -6,9 +6,9 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "db/dbformat.h" #include -#include "db/dbformat.h" #include "port/port.h" #include "util/coding.h" #include "util/perf_context_imp.h" @@ -72,6 +72,28 @@ int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const { return r; } +int InternalKeyComparator::Compare(const ParsedInternalKey& a, + const ParsedInternalKey& b) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + // decreasing type (though sequence# should be enough to disambiguate) + int r = user_comparator_->Compare(a.user_key, b.user_key); + BumpPerfCount(&perf_context.user_key_comparison_count); + if (r == 0) { + if (a.sequence > b.sequence) { + r = -1; + } else if (a.sequence < b.sequence) { + r = +1; + } else if (a.type > b.type) { + r = -1; + } else if (a.type < b.type) { + r = +1; + } + } + return r; +} + void InternalKeyComparator::FindShortestSeparator( std::string* start, const Slice& limit) const { diff --git a/db/dbformat.h b/db/dbformat.h index 82031cf5c..be46d14a1 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -25,7 +25,9 @@ class InternalKey; // Value types encoded as the last component of internal keys. // DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk // data structures. -enum ValueType { +// The highest bit of the value type needs to be reserved to SST tables +// for them to do more flexible encoding. 
+enum ValueType : unsigned char { kTypeDeletion = 0x0, kTypeValue = 0x1, kTypeMerge = 0x2, @@ -33,7 +35,9 @@ enum ValueType { kTypeColumnFamilyDeletion = 0x4, kTypeColumnFamilyValue = 0x5, kTypeColumnFamilyMerge = 0x6, + kMaxValue = 0x7F }; + // kValueTypeForSeek defines the ValueType that should be passed when // constructing a ParsedInternalKey object for seeking to a particular // sequence number (since we sort sequence numbers in decreasing order @@ -99,6 +103,7 @@ class InternalKeyComparator : public Comparator { name_("rocksdb.InternalKeyComparator:" + std::string(user_comparator_->Name())) { } + virtual ~InternalKeyComparator() {} virtual const char* Name() const; virtual int Compare(const Slice& a, const Slice& b) const; @@ -110,6 +115,7 @@ class InternalKeyComparator : public Comparator { const Comparator* user_comparator() const { return user_comparator_; } int Compare(const InternalKey& a, const InternalKey& b) const; + int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const; }; // Filter policy wrapper that converts from internal keys to user keys @@ -166,6 +172,7 @@ inline bool ParseInternalKey(const Slice& internal_key, unsigned char c = num & 0xff; result->sequence = num >> 8; result->type = static_cast(c); + assert(result->type <= ValueType::kMaxValue); result->user_key = Slice(internal_key.data(), n - 8); return (c <= static_cast(kValueTypeForSeek)); } diff --git a/db/log_format.h b/db/log_format.h index 10a31ba27..919c087e2 100644 --- a/db/log_format.h +++ b/db/log_format.h @@ -17,7 +17,6 @@ namespace log { enum RecordType { // Zero is reserved for preallocated files kZeroType = 0, - kFullType = 1, // For fragments diff --git a/db/memtable.cc b/db/memtable.cc index 1616a1227..2f84a289e 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -17,10 +17,14 @@ #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/merge_operator.h" +#include "rocksdb/slice_transform.h" +#include "util/arena.h" #include "util/coding.h" 
-#include "util/mutexlock.h" #include "util/murmurhash.h" +#include "util/mutexlock.h" +#include "util/perf_context_imp.h" #include "util/statistics.h" +#include "util/stop_watch.h" namespace std { template <> @@ -37,9 +41,8 @@ MemTable::MemTable(const InternalKeyComparator& cmp, const ColumnFamilyOptions& options) : comparator_(cmp), refs_(0), - arena_impl_(options.arena_block_size), - table_(options.memtable_factory->CreateMemTableRep(comparator_, - &arena_impl_)), + arena_(options.arena_block_size), + table_(options.memtable_factory->CreateMemTableRep(comparator_, &arena_)), flush_in_progress_(false), flush_completed_(false), file_number_(0), @@ -47,23 +50,36 @@ MemTable::MemTable(const InternalKeyComparator& cmp, mem_next_logfile_number_(0), mem_logfile_number_(0), locks_(options.inplace_update_support ? options.inplace_update_num_locks - : 0) {} + : 0), + prefix_extractor_(options.prefix_extractor) { + if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) { + prefix_bloom_.reset(new DynamicBloom(options.memtable_prefix_bloom_bits, + options.memtable_prefix_bloom_probes)); + } +} MemTable::~MemTable() { assert(refs_ == 0); } size_t MemTable::ApproximateMemoryUsage() { - return arena_impl_.ApproximateMemoryUsage() + - table_->ApproximateMemoryUsage(); + return arena_.ApproximateMemoryUsage() + table_->ApproximateMemoryUsage(); } -int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr) +int MemTable::KeyComparator::operator()(const char* prefix_len_key1, + const char* prefix_len_key2) const { + // Internal keys are encoded as length-prefixed strings. + Slice k1 = GetLengthPrefixedSlice(prefix_len_key1); + Slice k2 = GetLengthPrefixedSlice(prefix_len_key2); + return comparator.Compare(k1, k2); +} + +int MemTable::KeyComparator::operator()(const char* prefix_len_key, + const Slice& key) const { // Internal keys are encoded as length-prefixed strings. 
- Slice a = GetLengthPrefixedSlice(aptr); - Slice b = GetLengthPrefixedSlice(bptr); - return comparator.Compare(a, b); + Slice a = GetLengthPrefixedSlice(prefix_len_key); + return comparator.Compare(a, key); } Slice MemTableRep::UserKey(const char* key) const { @@ -74,7 +90,7 @@ Slice MemTableRep::UserKey(const char* key) const { // Encode a suitable internal key target for "target" and return it. // Uses *scratch as scratch space, and the returned pointer will point // into this scratch space. -static const char* EncodeKey(std::string* scratch, const Slice& target) { +const char* EncodeKey(std::string* scratch, const Slice& target) { scratch->clear(); PutVarint32(scratch, target.size()); scratch->append(target.data(), target.size()); @@ -83,27 +99,53 @@ static const char* EncodeKey(std::string* scratch, const Slice& target) { class MemTableIterator: public Iterator { public: - MemTableIterator(MemTableRep* table, const ReadOptions& options) - : iter_() { + MemTableIterator(const MemTable& mem, const ReadOptions& options) + : mem_(mem), iter_(), dynamic_prefix_seek_(false), valid_(false) { if (options.prefix) { - iter_.reset(table->GetPrefixIterator(*options.prefix)); + iter_.reset(mem_.table_->GetPrefixIterator(*options.prefix)); } else if (options.prefix_seek) { - iter_.reset(table->GetDynamicPrefixIterator()); + dynamic_prefix_seek_ = true; + iter_.reset(mem_.table_->GetDynamicPrefixIterator()); } else { - iter_.reset(table->GetIterator()); + iter_.reset(mem_.table_->GetIterator()); } } - virtual bool Valid() const { return iter_->Valid(); } - virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); } - virtual void SeekToFirst() { iter_->SeekToFirst(); } - virtual void SeekToLast() { iter_->SeekToLast(); } - virtual void Next() { iter_->Next(); } - virtual void Prev() { iter_->Prev(); } + virtual bool Valid() const { return valid_; } + virtual void Seek(const Slice& k) { + if (dynamic_prefix_seek_ && mem_.prefix_bloom_ && + 
!mem_.prefix_bloom_->MayContain( + mem_.prefix_extractor_->Transform(ExtractUserKey(k)))) { + valid_ = false; + return; + } + iter_->Seek(k, nullptr); + valid_ = iter_->Valid(); + } + virtual void SeekToFirst() { + iter_->SeekToFirst(); + valid_ = iter_->Valid(); + } + virtual void SeekToLast() { + iter_->SeekToLast(); + valid_ = iter_->Valid(); + } + virtual void Next() { + assert(Valid()); + iter_->Next(); + valid_ = iter_->Valid(); + } + virtual void Prev() { + assert(Valid()); + iter_->Prev(); + valid_ = iter_->Valid(); + } virtual Slice key() const { + assert(Valid()); return GetLengthPrefixedSlice(iter_->key()); } virtual Slice value() const { + assert(Valid()); Slice key_slice = GetLengthPrefixedSlice(iter_->key()); return GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); } @@ -111,8 +153,10 @@ class MemTableIterator: public Iterator { virtual Status status() const { return Status::OK(); } private: - std::unique_ptr iter_; - std::string tmp_; // For passing to EncodeKey + const MemTable& mem_; + std::shared_ptr iter_; + bool dynamic_prefix_seek_; + bool valid_; // No copying allowed MemTableIterator(const MemTableIterator&); @@ -120,7 +164,7 @@ class MemTableIterator: public Iterator { }; Iterator* MemTable::NewIterator(const ReadOptions& options) { - return new MemTableIterator(table_.get(), options); + return new MemTableIterator(*this, options); } port::RWMutex* MemTable::GetLock(const Slice& key) { @@ -128,7 +172,7 @@ port::RWMutex* MemTable::GetLock(const Slice& key) { } void MemTable::Add(SequenceNumber s, ValueType type, - const Slice& key, + const Slice& key, /* user key */ const Slice& value) { // Format of an entry is concatenation of: // key_size : varint32 of internal_key.size() @@ -141,7 +185,7 @@ void MemTable::Add(SequenceNumber s, ValueType type, const size_t encoded_len = VarintLength(internal_key_size) + internal_key_size + VarintLength(val_size) + val_size; - char* buf = arena_impl_.Allocate(encoded_len); + char* buf = 
arena_.Allocate(encoded_len); char* p = EncodeVarint32(buf, internal_key_size); memcpy(p, key.data(), key_size); p += key_size; @@ -152,6 +196,11 @@ void MemTable::Add(SequenceNumber s, ValueType type, assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len); table_->Insert(buf); + if (prefix_bloom_) { + assert(prefix_extractor_); + prefix_bloom_->Add(prefix_extractor_->Transform(key)); + } + // The first sequence number inserted into the memtable assert(first_seqno_ == 0 || s > first_seqno_); if (first_seqno_ == 0) { @@ -161,17 +210,28 @@ void MemTable::Add(SequenceNumber s, ValueType type, bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, MergeContext& merge_context, const Options& options) { - Slice memkey = key.memtable_key(); - std::unique_ptr iter( - table_->GetIterator(key.user_key())); - iter->Seek(memkey.data()); + StopWatchNano memtable_get_timer(options.env, false); + StartPerfTimer(&memtable_get_timer); + + Slice mem_key = key.memtable_key(); + Slice user_key = key.user_key(); + + std::unique_ptr iter; + if (prefix_bloom_ && + !prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key))) { + // iter is null if prefix bloom says the key does not exist + } else { + iter.reset(table_->GetIterator(user_key)); + iter->Seek(key.internal_key(), mem_key.data()); + } bool merge_in_progress = s->IsMergeInProgress(); auto merge_operator = options.merge_operator.get(); auto logger = options.info_log; std::string merge_result; - for (; iter->Valid(); iter->Next()) { + bool found_final_value = false; + for (; !found_final_value && iter && iter->Valid(); iter->Next()) { // entry format is: // klength varint32 // userkey char[klength-8] @@ -182,7 +242,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, // sequence number since the Seek() call above should have skipped // all entries with overly large sequence numbers. 
const char* entry = iter->key(); - uint32_t key_length; + uint32_t key_length = 0; const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); if (comparator_.comparator.user_comparator()->Compare( Slice(key_ptr, key_length - 8), key.user_key()) == 0) { @@ -209,7 +269,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, if (options.inplace_update_support) { GetLock(key.user_key())->Unlock(); } - return true; + found_final_value = true; + break; } case kTypeDeletion: { if (merge_in_progress) { @@ -224,7 +285,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, } else { *s = Status::NotFound(); } - return true; + found_final_value = true; + break; } case kTypeMerge: { Slice v = GetLengthPrefixedSlice(key_ptr + key_length); @@ -244,10 +306,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, } break; } - case kTypeColumnFamilyDeletion: - case kTypeColumnFamilyValue: - case kTypeColumnFamilyMerge: - case kTypeLogData: + default: assert(false); break; } @@ -259,25 +318,27 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, // No change to value, since we have not yet found a Put/Delete - if (merge_in_progress) { + if (!found_final_value && merge_in_progress) { *s = Status::MergeInProgress(""); } - return false; + BumpPerfTime(&perf_context.get_from_memtable_time, &memtable_get_timer); + BumpPerfCount(&perf_context.get_from_memtable_count); + return found_final_value; } -bool MemTable::Update(SequenceNumber seq, ValueType type, +void MemTable::Update(SequenceNumber seq, const Slice& key, const Slice& value) { LookupKey lkey(key, seq); - Slice memkey = lkey.memtable_key(); + Slice mem_key = lkey.memtable_key(); std::unique_ptr iter( - table_->GetIterator(lkey.user_key())); - iter->Seek(memkey.data()); + table_->GetIterator(lkey.user_key())); + iter->Seek(lkey.internal_key(), mem_key.data()); if (iter->Valid()) { // entry format is: - // klength varint32 + // key_length 
varint32 // userkey char[klength-8] // tag uint64 // vlength varint32 @@ -286,7 +347,7 @@ bool MemTable::Update(SequenceNumber seq, ValueType type, // sequence number since the Seek() call above should have skipped // all entries with overly large sequence numbers. const char* entry = iter->key(); - uint32_t key_length; + uint32_t key_length = 0; const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); if (comparator_.comparator.user_comparator()->Compare( Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) { @@ -294,32 +355,105 @@ bool MemTable::Update(SequenceNumber seq, ValueType type, const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); switch (static_cast(tag & 0xff)) { case kTypeValue: { - uint32_t vlength; - GetVarint32Ptr(key_ptr + key_length, - key_ptr + key_length+5, &vlength); - // Update value, if newValue size <= curValue size - if (value.size() <= vlength) { + Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length); + uint32_t prev_size = prev_value.size(); + uint32_t new_size = value.size(); + + // Update value, if new value size <= previous value size + if (new_size <= prev_size ) { char* p = EncodeVarint32(const_cast(key_ptr) + key_length, - value.size()); + new_size); WriteLock wl(GetLock(lkey.user_key())); memcpy(p, value.data(), value.size()); assert((unsigned)((p + value.size()) - entry) == (unsigned)(VarintLength(key_length) + key_length + VarintLength(value.size()) + value.size())); - return true; + return; } } default: // If the latest value is kTypeDeletion, kTypeMerge or kTypeLogData - // then we probably don't have enough space to update in-place - // Maybe do something later - // Return false, and do normal Add() - return false; + // we don't have enough space for update inplace + Add(seq, kTypeValue, key, value); + return; } } } - // Key doesn't exist + // key doesn't exist + Add(seq, kTypeValue, key, value); +} + +bool MemTable::UpdateCallback(SequenceNumber seq, + const Slice& key, + const Slice& delta, 
+ const Options& options) { + LookupKey lkey(key, seq); + Slice memkey = lkey.memtable_key(); + + std::shared_ptr iter( + table_->GetIterator(lkey.user_key())); + iter->Seek(lkey.internal_key(), memkey.data()); + + if (iter->Valid()) { + // entry format is: + // key_length varint32 + // userkey char[klength-8] + // tag uint64 + // vlength varint32 + // value char[vlength] + // Check that it belongs to same user key. We do not check the + // sequence number since the Seek() call above should have skipped + // all entries with overly large sequence numbers. + const char* entry = iter->key(); + uint32_t key_length = 0; + const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if (comparator_.comparator.user_comparator()->Compare( + Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) { + // Correct user key + const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); + switch (static_cast(tag & 0xff)) { + case kTypeValue: { + Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length); + uint32_t prev_size = prev_value.size(); + + char* prev_buffer = const_cast(prev_value.data()); + uint32_t new_prev_size = prev_size; + + std::string str_value; + WriteLock wl(GetLock(lkey.user_key())); + auto status = options.inplace_callback(prev_buffer, &new_prev_size, + delta, &str_value); + if (status == UpdateStatus::UPDATED_INPLACE) { + // Value already updated by callback. + assert(new_prev_size <= prev_size); + if (new_prev_size < prev_size) { + // overwrite the new prev_size + char* p = EncodeVarint32(const_cast(key_ptr) + key_length, + new_prev_size); + if (VarintLength(new_prev_size) < VarintLength(prev_size)) { + // shift the value buffer as well. 
+ memcpy(p, prev_buffer, new_prev_size); + } + } + RecordTick(options.statistics.get(), NUMBER_KEYS_UPDATED); + return true; + } else if (status == UpdateStatus::UPDATED) { + Add(seq, kTypeValue, key, Slice(str_value)); + RecordTick(options.statistics.get(), NUMBER_KEYS_WRITTEN); + return true; + } else if (status == UpdateStatus::UPDATE_FAILED) { + // No action required. Return. + return true; + } + } + default: + break; + } + } + } + // If the latest value is not kTypeValue + // or key doesn't exist return false; } @@ -331,13 +465,13 @@ size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) { // The iterator only needs to be ordered within the same user key. std::unique_ptr iter( table_->GetIterator(key.user_key())); - iter->Seek(memkey.data()); + iter->Seek(key.internal_key(), memkey.data()); size_t num_successive_merges = 0; for (; iter->Valid(); iter->Next()) { const char* entry = iter->key(); - uint32_t key_length; + uint32_t key_length = 0; const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); if (!comparator_.comparator.user_comparator()->Compare( Slice(iter_key_ptr, key_length - 8), key.user_key()) == 0) { diff --git a/db/memtable.h b/db/memtable.h index 415c7070b..61bebaee0 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -16,7 +16,8 @@ #include "db/version_edit.h" #include "rocksdb/db.h" #include "rocksdb/memtablerep.h" -#include "util/arena_impl.h" +#include "util/arena.h" +#include "util/dynamic_bloom.h" namespace rocksdb { @@ -29,7 +30,10 @@ class MemTable { struct KeyComparator : public MemTableRep::KeyComparator { const InternalKeyComparator comparator; explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { } - virtual int operator()(const char* a, const char* b) const; + virtual int operator()(const char* prefix_len_key1, + const char* prefix_len_key2) const; + virtual int operator()(const char* prefix_len_key, + const Slice& key) const override; }; // MemTables are reference counted. 
The initial reference count @@ -94,16 +98,31 @@ class MemTable { bool Get(const LookupKey& key, std::string* value, Status* s, MergeContext& merge_context, const Options& options); - // Update the value and return status ok, - // if key exists in current memtable - // if new sizeof(new_value) <= sizeof(old_value) && - // old_value for that key is a put i.e. kTypeValue - // else return false, and status - NotUpdatable() - // else return false, and status - NotFound() - bool Update(SequenceNumber seq, ValueType type, + // Attempts to update the new_value inplace, else does normal Add + // Pseudocode + // if key exists in current memtable && prev_value is of type kTypeValue + // if new sizeof(new_value) <= sizeof(prev_value) + // update inplace + // else add(key, new_value) + // else add(key, new_value) + void Update(SequenceNumber seq, const Slice& key, const Slice& value); + // If prev_value for key exists, attempts to update it inplace. + // else returns false + // Pseudocode + // if key exists in current memtable && prev_value is of type kTypeValue + // new_value = delta(prev_value) + // if sizeof(new_value) <= sizeof(prev_value) + // update inplace + // else add(key, new_value) + // else return false + bool UpdateCallback(SequenceNumber seq, + const Slice& key, + const Slice& delta, + const Options& options); + // Returns the number of successive merge entries starting from the newest + // entry for the key up to the last non-merge entry or last entry for the + // key in the memtable. 
@@ -142,7 +161,7 @@ class MemTable { KeyComparator comparator_; int refs_; - ArenaImpl arena_impl_; + Arena arena_; unique_ptr table_; // These are used to manage memtable flushes to storage @@ -150,7 +169,7 @@ class MemTable { bool flush_completed_; // finished the flush uint64_t file_number_; // filled up after flush is complete - // The udpates to be applied to the transaction log when this + // The updates to be applied to the transaction log when this // memtable is flushed to storage. VersionEdit edit_; @@ -173,6 +192,11 @@ class MemTable { // Get the lock associated for the key port::RWMutex* GetLock(const Slice& key); + + const SliceTransform* const prefix_extractor_; + std::unique_ptr prefix_bloom_; }; +extern const char* EncodeKey(std::string* scratch, const Slice& target); + } // namespace rocksdb diff --git a/db/memtablelist.cc b/db/memtable_list.cc similarity index 94% rename from db/memtablelist.cc rename to db/memtable_list.cc index cbfb7c85e..240edde15 100644 --- a/db/memtablelist.cc +++ b/db/memtable_list.cc @@ -3,7 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. // -#include "db/memtablelist.h" +#include "db/memtable_list.h" #include #include "rocksdb/db.h" @@ -31,7 +31,7 @@ MemTableListVersion::MemTableListVersion(MemTableListVersion* old) { void MemTableListVersion::Ref() { ++refs_; } -void MemTableListVersion::Unref(std::vector* to_delete) { +void MemTableListVersion::Unref(autovector* to_delete) { assert(refs_ >= 1); --refs_; if (refs_ == 0) { @@ -103,7 +103,7 @@ bool MemTableList::IsFlushPending() { } // Returns the memtables that need to be flushed. 
-void MemTableList::PickMemtablesToFlush(std::vector* ret) { +void MemTableList::PickMemtablesToFlush(autovector* ret) { const auto& memlist = current_->memlist_; for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { MemTable* m = *it; @@ -113,18 +113,18 @@ void MemTableList::PickMemtablesToFlush(std::vector* ret) { if (num_flush_not_started_ == 0) { imm_flush_needed.Release_Store(nullptr); } - m->flush_in_progress_ = true; // flushing will start very soon + m->flush_in_progress_ = true; // flushing will start very soon ret->push_back(m); } } - flush_requested_ = false; // start-flush request is complete + flush_requested_ = false; // start-flush request is complete } // Record a successful flush in the manifest file Status MemTableList::InstallMemtableFlushResults( - ColumnFamilyData* cfd, const std::vector& mems, VersionSet* vset, + ColumnFamilyData* cfd, const autovector& mems, VersionSet* vset, Status flushStatus, port::Mutex* mu, Logger* info_log, uint64_t file_number, - std::set& pending_outputs, std::vector* to_delete, + std::set& pending_outputs, autovector* to_delete, Directory* db_directory) { mu->AssertHeld(); diff --git a/db/memtablelist.h b/db/memtable_list.h similarity index 90% rename from db/memtablelist.h rename to db/memtable_list.h index d4fee3afd..9ade48798 100644 --- a/db/memtablelist.h +++ b/db/memtable_list.h @@ -3,18 +3,25 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
// - #pragma once + #include #include #include #include +#include #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/iterator.h" + #include "db/dbformat.h" +#include "db/memtable.h" #include "db/skiplist.h" #include "db/memtable.h" +#include "rocksdb/db.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "util/autovector.h" namespace rocksdb { @@ -30,7 +37,7 @@ class MemTableListVersion { explicit MemTableListVersion(MemTableListVersion* old = nullptr); void Ref(); - void Unref(std::vector* to_delete = nullptr); + void Unref(autovector* to_delete = nullptr); int size() const; @@ -89,14 +96,14 @@ class MemTableList { // Returns the earliest memtables that needs to be flushed. The returned // memtables are guaranteed to be in the ascending order of created time. - void PickMemtablesToFlush(std::vector* mems); + void PickMemtablesToFlush(autovector* mems); // Commit a successful flush in the manifest file Status InstallMemtableFlushResults( - ColumnFamilyData* cfd, const std::vector& m, VersionSet* vset, + ColumnFamilyData* cfd, const autovector& m, VersionSet* vset, Status flushStatus, port::Mutex* mu, Logger* info_log, uint64_t file_number, std::set& pending_outputs, - std::vector* to_delete, Directory* db_directory); + autovector* to_delete, Directory* db_directory); // New memtables are inserted at the front of the list. // Takes ownership of the referenced held on *m by the caller of Add(). 
diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index 0934de0cd..472cc719a 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -174,6 +174,13 @@ void ProfileKeyComparison() { HistogramImpl hist_put; HistogramImpl hist_get; + HistogramImpl hist_get_snapshot; + HistogramImpl hist_get_memtable; + HistogramImpl hist_get_post_process; + HistogramImpl hist_num_memtable_checked; + HistogramImpl hist_write_pre_post; + HistogramImpl hist_write_wal_time; + HistogramImpl hist_write_memtable_time; std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n"; @@ -192,16 +199,37 @@ void ProfileKeyComparison() { perf_context.Reset(); db->Put(write_options, key, value); + hist_write_pre_post.Add(perf_context.write_pre_and_post_process_time); + hist_write_wal_time.Add(perf_context.write_wal_time); + hist_write_memtable_time.Add(perf_context.write_memtable_time); hist_put.Add(perf_context.user_key_comparison_count); perf_context.Reset(); db->Get(read_options, key, &value); + hist_get_snapshot.Add(perf_context.get_snapshot_time); + hist_get_memtable.Add(perf_context.get_from_memtable_time); + hist_num_memtable_checked.Add(perf_context.get_from_memtable_count); + hist_get_post_process.Add(perf_context.get_post_process_time); hist_get.Add(perf_context.user_key_comparison_count); } std::cout << "Put uesr key comparison: \n" << hist_put.ToString() << "Get uesr key comparison: \n" << hist_get.ToString(); - + std::cout << "Put(): Pre and Post Process Time: \n" + << hist_write_pre_post.ToString() + << " Writing WAL time: \n" + << hist_write_wal_time.ToString() << "\n" + << " Writing Mem Table time: \n" + << hist_write_memtable_time.ToString() << "\n"; + + std::cout << "Get(): Time to get snapshot: \n" + << hist_get_snapshot.ToString() + << " Time to get value from memtables: \n" + << hist_get_memtable.ToString() << "\n" + << " Number of memtables checked: \n" + << hist_num_memtable_checked.ToString() << "\n" + << " Time to post process: \n" + 
<< hist_get_post_process.ToString() << "\n"; } TEST(PerfContextTest, KeyComparisonCount) { @@ -259,8 +287,8 @@ TEST(PerfContextTest, SeekKeyComparison) { db->Put(write_options, key, value); auto put_time = timer.ElapsedNanos(); hist_put_time.Add(put_time); - hist_wal_time.Add(perf_context.wal_write_time); - hist_time_diff.Add(put_time - perf_context.wal_write_time); + hist_wal_time.Add(perf_context.write_wal_time); + hist_time_diff.Add(put_time - perf_context.write_wal_time); } std::cout << "Put time:\n" << hist_put_time.ToString() diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc new file mode 100644 index 000000000..0d554278c --- /dev/null +++ b/db/plain_table_db_test.cc @@ -0,0 +1,337 @@ +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include +#include + +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "table/plain_table_factory.h" +#include "util/hash.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "utilities/merge_operators.h" + +using std::unique_ptr; + +namespace rocksdb { + +class PlainTableDBTest { + protected: + private: + std::string dbname_; + Env* env_; + DB* db_; + + Options last_options_; + static std::unique_ptr prefix_transform; + + public: + PlainTableDBTest() : env_(Env::Default()) { + dbname_ = test::TmpDir() + "/plain_table_db_test"; + ASSERT_OK(DestroyDB(dbname_, Options())); + db_ = nullptr; + Reopen(); + } + + ~PlainTableDBTest() { + delete db_; + ASSERT_OK(DestroyDB(dbname_, Options())); + } + + // Return the current option configuration. 
+ Options CurrentOptions() { + Options options; + options.table_factory.reset(new PlainTableFactory(16, 2, 0.8)); + options.prefix_extractor = prefix_transform.get(); + options.allow_mmap_reads = true; + return options; + } + + DBImpl* dbfull() { + return reinterpret_cast(db_); + } + + void Reopen(Options* options = nullptr) { + ASSERT_OK(TryReopen(options)); + } + + void Close() { + delete db_; + db_ = nullptr; + } + + void DestroyAndReopen(Options* options = nullptr) { + //Destroy using last options + Destroy(&last_options_); + ASSERT_OK(TryReopen(options)); + } + + void Destroy(Options* options) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, *options)); + } + + Status PureReopen(Options* options, DB** db) { + return DB::Open(*options, dbname_, db); + } + + Status TryReopen(Options* options = nullptr) { + delete db_; + db_ = nullptr; + Options opts; + if (options != nullptr) { + opts = *options; + } else { + opts = CurrentOptions(); + opts.create_if_missing = true; + } + last_options_ = opts; + + return DB::Open(opts, dbname_, &db_); + } + + Status Put(const Slice& k, const Slice& v) { + return db_->Put(WriteOptions(), k, v); + } + + Status Delete(const std::string& k) { + return db_->Delete(WriteOptions(), k); + } + + std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { + ReadOptions options; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + + int NumTableFilesAtLevel(int level) { + std::string property; + ASSERT_TRUE( + db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level), + &property)); + return atoi(property.c_str()); + } + + // Return spread of files per level + std::string FilesPerLevel() { + std::string result; + int last_non_zero_offset = 0; + for (int level = 0; level < db_->NumberLevels(); level++) { + int f = 
NumTableFilesAtLevel(level); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; + } + + std::string IterStatus(Iterator* iter) { + std::string result; + if (iter->Valid()) { + result = iter->key().ToString() + "->" + iter->value().ToString(); + } else { + result = "(invalid)"; + } + return result; + } +}; + +std::unique_ptr PlainTableDBTest::prefix_transform( + NewFixedPrefixTransform(8)); + +TEST(PlainTableDBTest, Empty) { + ASSERT_TRUE(dbfull() != nullptr); + ASSERT_EQ("NOT_FOUND", Get("0000000000000foo")); +} + +TEST(PlainTableDBTest, ReadWrite) { + ASSERT_OK(Put("1000000000000foo", "v1")); + ASSERT_EQ("v1", Get("1000000000000foo")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("1000000000000foo", "v3")); + ASSERT_EQ("v3", Get("1000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); +} + +TEST(PlainTableDBTest, Flush) { + ASSERT_OK(Put("1000000000000foo", "v1")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("1000000000000foo", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("1000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); +} + +TEST(PlainTableDBTest, Iterator) { + ASSERT_OK(Put("1000000000foo002", "v_2")); + ASSERT_OK(Put("0000000000000bar", "random")); + ASSERT_OK(Put("1000000000foo001", "v1")); + ASSERT_OK(Put("3000000000000bar", "bar_v")); + ASSERT_OK(Put("1000000000foo003", "v__3")); + ASSERT_OK(Put("1000000000foo004", "v__4")); + ASSERT_OK(Put("1000000000foo005", "v__5")); + ASSERT_OK(Put("1000000000foo007", "v__7")); + ASSERT_OK(Put("1000000000foo008", "v__8")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v1", Get("1000000000foo001")); + ASSERT_EQ("v__3", Get("1000000000foo003")); + ReadOptions ro; + Iterator* iter = dbfull()->NewIterator(ro); + iter->Seek("1000000000foo001"); + ASSERT_TRUE(iter->Valid()); + 
ASSERT_EQ("1000000000foo001", iter->key().ToString()); + ASSERT_EQ("v1", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo002", iter->key().ToString()); + ASSERT_EQ("v_2", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo003", iter->key().ToString()); + ASSERT_EQ("v__3", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo004", iter->key().ToString()); + ASSERT_EQ("v__4", iter->value().ToString()); + + iter->Seek("3000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + ASSERT_EQ("bar_v", iter->value().ToString()); + + iter->Seek("1000000000foo000"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo001", iter->key().ToString()); + ASSERT_EQ("v1", iter->value().ToString()); + + iter->Seek("1000000000foo005"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo005", iter->key().ToString()); + ASSERT_EQ("v__5", iter->value().ToString()); + + iter->Seek("1000000000foo006"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo007", iter->key().ToString()); + ASSERT_EQ("v__7", iter->value().ToString()); + + iter->Seek("1000000000foo008"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo008", iter->key().ToString()); + ASSERT_EQ("v__8", iter->value().ToString()); + + iter->Seek("1000000000foo009"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + + + delete iter; +} + +TEST(PlainTableDBTest, Flush2) { + ASSERT_OK(Put("0000000000000bar", "b")); + ASSERT_OK(Put("1000000000000foo", "v1")); + dbfull()->TEST_FlushMemTable(); + + ASSERT_OK(Put("1000000000000foo", "v2")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get("1000000000000foo")); + + ASSERT_OK(Put("0000000000000eee", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("0000000000000eee")); + + ASSERT_OK(Delete("0000000000000bar")); + 
dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); + + ASSERT_OK(Put("0000000000000eee", "v5")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v5", Get("0000000000000eee")); +} + +static std::string Key(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "key_______%06d", i); + return std::string(buf); +} + +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} + +TEST(PlainTableDBTest, CompactionTrigger) { + Options options = CurrentOptions(); + options.write_buffer_size = 100 << 10; //100KB + options.num_levels = 3; + options.max_mem_compaction_level = 0; + options.level0_file_num_compaction_trigger = 3; + Reopen(&options); + + Random rnd(301); + + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; + num++) { + std::vector values; + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + } + + //generate one more file in level-0, and should trigger level-0 compaction + std::vector values; + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/prefix_test.cc b/db/prefix_test.cc index 7e5e9cc0e..ca00c31b3 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -16,11 +16,15 @@ DEFINE_bool(trigger_deadlock, false, DEFINE_uint64(bucket_count, 100000, "number of buckets"); DEFINE_uint64(num_locks, 10001, "number of locks"); DEFINE_bool(random_prefix, false, "randomize prefix"); -DEFINE_uint64(total_prefixes, 1000, "total number of prefixes"); 
-DEFINE_uint64(items_per_prefix, 10, "total number of values per prefix"); -DEFINE_int64(write_buffer_size, 1000000000, ""); -DEFINE_int64(max_write_buffer_number, 8, ""); -DEFINE_int64(min_write_buffer_number_to_merge, 7, ""); +DEFINE_uint64(total_prefixes, 100000, "total number of prefixes"); +DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix"); +DEFINE_int64(write_buffer_size, 33554432, ""); +DEFINE_int64(max_write_buffer_number, 2, ""); +DEFINE_int64(min_write_buffer_number_to_merge, 1, ""); +DEFINE_int32(skiplist_height, 4, ""); +DEFINE_int32(memtable_prefix_bloom_bits, 10000000, ""); +DEFINE_int32(memtable_prefix_bloom_probes, 10, ""); +DEFINE_int32(value_size, 40, ""); // Path to the database on file system const std::string kDbName = rocksdb::test::TmpDir() + "/prefix_test"; @@ -104,218 +108,265 @@ class PrefixTest { options.min_write_buffer_number_to_merge = FLAGS_min_write_buffer_number_to_merge; - options.comparator = new TestKeyComparator(); - if (FLAGS_use_prefix_hash_memtable) { - auto prefix_extractor = NewFixedPrefixTransform(8); - options.prefix_extractor = prefix_extractor; - options.memtable_factory.reset(NewHashSkipListRepFactory( - prefix_extractor, FLAGS_bucket_count)); - } + options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits; + options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes; Status s = DB::Open(options, kDbName, &db); ASSERT_OK(s); return std::shared_ptr(db); } + + bool NextOptions() { + // skip some options + option_config_++; + if (option_config_ < kEnd) { + auto prefix_extractor = NewFixedPrefixTransform(8); + options.prefix_extractor = prefix_extractor; + switch(option_config_) { + case kHashSkipList: + options.memtable_factory.reset( + NewHashSkipListRepFactory(options.prefix_extractor, + FLAGS_bucket_count, + FLAGS_skiplist_height)); + return true; + case kHashLinkList: + options.memtable_factory.reset( + NewHashLinkListRepFactory(options.prefix_extractor, + 
FLAGS_bucket_count)); + return true; + default: + return false; + } + } + return false; + } + + PrefixTest() : option_config_(kBegin) { + options.comparator = new TestKeyComparator(); + } ~PrefixTest() { delete options.comparator; } protected: + enum OptionConfig { + kBegin, + kHashSkipList, + kHashLinkList, + kEnd + }; + int option_config_; Options options; }; TEST(PrefixTest, DynamicPrefixIterator) { + while (NextOptions()) { + std::cout << "*** Mem table: " << options.memtable_factory->Name() + << std::endl; + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + std::vector prefixes; + for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) { + prefixes.push_back(i); + } - DestroyDB(kDbName, Options()); - auto db = OpenDb(); - WriteOptions write_options; - ReadOptions read_options; + if (FLAGS_random_prefix) { + std::random_shuffle(prefixes.begin(), prefixes.end()); + } - std::vector prefixes; - for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) { - prefixes.push_back(i); - } + HistogramImpl hist_put_time; + HistogramImpl hist_put_comparison; - if (FLAGS_random_prefix) { - std::random_shuffle(prefixes.begin(), prefixes.end()); - } + // insert x random prefix, each with y continuous element. + for (auto prefix : prefixes) { + for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { + TestKey test_key(prefix, sorted); - // insert x random prefix, each with y continuous element. 
- for (auto prefix : prefixes) { - for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { - TestKey test_key(prefix, sorted); + Slice key = TestKeyToSlice(test_key); + std::string value(FLAGS_value_size, 0); - Slice key = TestKeyToSlice(test_key); - std::string value = "v" + std::to_string(sorted); + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + ASSERT_OK(db->Put(write_options, key, value)); + hist_put_time.Add(timer.ElapsedNanos()); + hist_put_comparison.Add(perf_context.user_key_comparison_count); + } + } + + std::cout << "Put key comparison: \n" << hist_put_comparison.ToString() + << "Put time: \n" << hist_put_time.ToString(); - ASSERT_OK(db->Put(write_options, key, value)); + // test seek existing keys + HistogramImpl hist_seek_time; + HistogramImpl hist_seek_comparison; + + if (FLAGS_use_prefix_hash_memtable) { + read_options.prefix_seek = true; } - } + std::unique_ptr iter(db->NewIterator(read_options)); - // test seek existing keys - HistogramImpl hist_seek_time; - HistogramImpl hist_seek_comparison; + for (auto prefix : prefixes) { + TestKey test_key(prefix, FLAGS_items_per_prefix / 2); + Slice key = TestKeyToSlice(test_key); + std::string value = "v" + std::to_string(0); - if (FLAGS_use_prefix_hash_memtable) { - read_options.prefix_seek = true; - } - std::unique_ptr iter(db->NewIterator(read_options)); - - for (auto prefix : prefixes) { - TestKey test_key(prefix, FLAGS_items_per_prefix / 2); - Slice key = TestKeyToSlice(test_key); - std::string value = "v" + std::to_string(0); - - perf_context.Reset(); - StopWatchNano timer(Env::Default(), true); - uint64_t total_keys = 0; - for (iter->Seek(key); iter->Valid(); iter->Next()) { - if (FLAGS_trigger_deadlock) { - std::cout << "Behold the deadlock!\n"; - db->Delete(write_options, iter->key()); + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + uint64_t total_keys = 0; + for (iter->Seek(key); iter->Valid(); iter->Next()) { + if (FLAGS_trigger_deadlock) 
{ + std::cout << "Behold the deadlock!\n"; + db->Delete(write_options, iter->key()); + } + auto test_key = SliceToTestKey(iter->key()); + if (test_key->prefix != prefix) break; + total_keys++; } - auto test_key = SliceToTestKey(iter->key()); - if (test_key->prefix != prefix) break; - total_keys++; + hist_seek_time.Add(timer.ElapsedNanos()); + hist_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2); } - hist_seek_time.Add(timer.ElapsedNanos()); - hist_seek_comparison.Add(perf_context.user_key_comparison_count); - ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2); - } - std::cout << "Seek key comparison: \n" - << hist_seek_comparison.ToString() - << "Seek time: \n" - << hist_seek_time.ToString(); - - // test non-existing keys - HistogramImpl hist_no_seek_time; - HistogramImpl hist_no_seek_comparison; - - for (auto prefix = FLAGS_total_prefixes; - prefix < FLAGS_total_prefixes + 100; - prefix++) { - TestKey test_key(prefix, 0); - Slice key = TestKeyToSlice(test_key); - - perf_context.Reset(); - StopWatchNano timer(Env::Default(), true); - iter->Seek(key); - hist_no_seek_time.Add(timer.ElapsedNanos()); - hist_no_seek_comparison.Add(perf_context.user_key_comparison_count); - ASSERT_TRUE(!iter->Valid()); - } + std::cout << "Seek key comparison: \n" + << hist_seek_comparison.ToString() + << "Seek time: \n" + << hist_seek_time.ToString(); - std::cout << "non-existing Seek key comparison: \n" - << hist_no_seek_comparison.ToString() - << "non-existing Seek time: \n" - << hist_no_seek_time.ToString(); -} + // test non-existing keys + HistogramImpl hist_no_seek_time; + HistogramImpl hist_no_seek_comparison; -TEST(PrefixTest, PrefixHash) { + for (auto prefix = FLAGS_total_prefixes; + prefix < FLAGS_total_prefixes + 10000; + prefix++) { + TestKey test_key(prefix, 0); + Slice key = TestKeyToSlice(test_key); - DestroyDB(kDbName, Options()); - auto db = OpenDb(); - 
WriteOptions write_options; - ReadOptions read_options; + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + iter->Seek(key); + hist_no_seek_time.Add(timer.ElapsedNanos()); + hist_no_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_TRUE(!iter->Valid()); + } - std::vector prefixes; - for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) { - prefixes.push_back(i); + std::cout << "non-existing Seek key comparison: \n" + << hist_no_seek_comparison.ToString() + << "non-existing Seek time: \n" + << hist_no_seek_time.ToString(); } +} - if (FLAGS_random_prefix) { - std::random_shuffle(prefixes.begin(), prefixes.end()); - } +TEST(PrefixTest, PrefixHash) { + while (NextOptions()) { + std::cout << "*** Mem table: " << options.memtable_factory->Name() + << std::endl; + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + std::vector prefixes; + for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) { + prefixes.push_back(i); + } - // insert x random prefix, each with y continuous element. - HistogramImpl hist_put_time; - HistogramImpl hist_put_comparison; + if (FLAGS_random_prefix) { + std::random_shuffle(prefixes.begin(), prefixes.end()); + } - for (auto prefix : prefixes) { - for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { - TestKey test_key(prefix, sorted); + // insert x random prefix, each with y continuous element. 
+ HistogramImpl hist_put_time; + HistogramImpl hist_put_comparison; - Slice key = TestKeyToSlice(test_key); - std::string value = "v" + std::to_string(sorted); + for (auto prefix : prefixes) { + for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { + TestKey test_key(prefix, sorted); - perf_context.Reset(); - StopWatchNano timer(Env::Default(), true); - ASSERT_OK(db->Put(write_options, key, value)); - hist_put_time.Add(timer.ElapsedNanos()); - hist_put_comparison.Add(perf_context.user_key_comparison_count); + Slice key = TestKeyToSlice(test_key); + std::string value = "v" + std::to_string(sorted); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + ASSERT_OK(db->Put(write_options, key, value)); + hist_put_time.Add(timer.ElapsedNanos()); + hist_put_comparison.Add(perf_context.user_key_comparison_count); + } } - } - std::cout << "Put key comparison: \n" << hist_put_comparison.ToString() - << "Put time: \n" << hist_put_time.ToString(); + std::cout << "Put key comparison: \n" << hist_put_comparison.ToString() + << "Put time: \n" << hist_put_time.ToString(); - // test seek existing keys - HistogramImpl hist_seek_time; - HistogramImpl hist_seek_comparison; + // test seek existing keys + HistogramImpl hist_seek_time; + HistogramImpl hist_seek_comparison; - for (auto prefix : prefixes) { - TestKey test_key(prefix, 0); - Slice key = TestKeyToSlice(test_key); - std::string value = "v" + std::to_string(0); + for (auto prefix : prefixes) { + TestKey test_key(prefix, 0); + Slice key = TestKeyToSlice(test_key); + std::string value = "v" + std::to_string(0); - Slice key_prefix; - if (FLAGS_use_prefix_hash_memtable) { - key_prefix = options.prefix_extractor->Transform(key); - read_options.prefix = &key_prefix; - } - std::unique_ptr iter(db->NewIterator(read_options)); + Slice key_prefix; + if (FLAGS_use_prefix_hash_memtable) { + key_prefix = options.prefix_extractor->Transform(key); + read_options.prefix = &key_prefix; + } + std::unique_ptr 
iter(db->NewIterator(read_options)); - perf_context.Reset(); - StopWatchNano timer(Env::Default(), true); - uint64_t total_keys = 0; - for (iter->Seek(key); iter->Valid(); iter->Next()) { - if (FLAGS_trigger_deadlock) { - std::cout << "Behold the deadlock!\n"; - db->Delete(write_options, iter->key()); + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + uint64_t total_keys = 0; + for (iter->Seek(key); iter->Valid(); iter->Next()) { + if (FLAGS_trigger_deadlock) { + std::cout << "Behold the deadlock!\n"; + db->Delete(write_options, iter->key()); + } + auto test_key = SliceToTestKey(iter->key()); + if (test_key->prefix != prefix) break; + total_keys++; } - auto test_key = SliceToTestKey(iter->key()); - if (test_key->prefix != prefix) break; - total_keys++; + hist_seek_time.Add(timer.ElapsedNanos()); + hist_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_EQ(total_keys, FLAGS_items_per_prefix); } - hist_seek_time.Add(timer.ElapsedNanos()); - hist_seek_comparison.Add(perf_context.user_key_comparison_count); - ASSERT_EQ(total_keys, FLAGS_items_per_prefix); - } - std::cout << "Seek key comparison: \n" - << hist_seek_comparison.ToString() - << "Seek time: \n" - << hist_seek_time.ToString(); + std::cout << "Seek key comparison: \n" + << hist_seek_comparison.ToString() + << "Seek time: \n" + << hist_seek_time.ToString(); - // test non-existing keys - HistogramImpl hist_no_seek_time; - HistogramImpl hist_no_seek_comparison; + // test non-existing keys + HistogramImpl hist_no_seek_time; + HistogramImpl hist_no_seek_comparison; - for (auto prefix = FLAGS_total_prefixes; - prefix < FLAGS_total_prefixes + 100; - prefix++) { - TestKey test_key(prefix, 0); - Slice key = TestKeyToSlice(test_key); + for (auto prefix = FLAGS_total_prefixes; + prefix < FLAGS_total_prefixes + 100; + prefix++) { + TestKey test_key(prefix, 0); + Slice key = TestKeyToSlice(test_key); - if (FLAGS_use_prefix_hash_memtable) { - Slice key_prefix = 
options.prefix_extractor->Transform(key); - read_options.prefix = &key_prefix; + if (FLAGS_use_prefix_hash_memtable) { + Slice key_prefix = options.prefix_extractor->Transform(key); + read_options.prefix = &key_prefix; + } + std::unique_ptr iter(db->NewIterator(read_options)); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + iter->Seek(key); + hist_no_seek_time.Add(timer.ElapsedNanos()); + hist_no_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_TRUE(!iter->Valid()); } - std::unique_ptr iter(db->NewIterator(read_options)); - perf_context.Reset(); - StopWatchNano timer(Env::Default(), true); - iter->Seek(key); - hist_no_seek_time.Add(timer.ElapsedNanos()); - hist_no_seek_comparison.Add(perf_context.user_key_comparison_count); - ASSERT_TRUE(!iter->Valid()); + std::cout << "non-existing Seek key comparison: \n" + << hist_no_seek_comparison.ToString() + << "non-existing Seek time: \n" + << hist_no_seek_time.ToString(); } - - std::cout << "non-existing Seek key comparison: \n" - << hist_no_seek_comparison.ToString() - << "non-existing Seek time: \n" - << hist_no_seek_time.ToString(); } } diff --git a/db/repair.cc b/db/repair.cc index 72387a71d..ed11870b0 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -231,10 +231,8 @@ class Repairer { FileMetaData meta; meta.number = next_file_number_++; Iterator* iter = mem->NewIterator(); - status = BuildTable(dbname_, env_, options_, storage_options_, - table_cache_, iter, &meta, - icmp_.user_comparator(), 0, 0, - kNoCompression); + status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_, + iter, &meta, icmp_, 0, 0, kNoCompression); delete iter; delete mem->Unref(); delete cf_mems_default; @@ -275,8 +273,9 @@ class Repairer { int counter = 0; Status status = env_->GetFileSize(fname, &t->meta.file_size); if (status.ok()) { + FileMetaData dummy_meta(t->meta.number, t->meta.file_size); Iterator* iter = table_cache_->NewIterator( - ReadOptions(), storage_options_, 
t->meta.number, t->meta.file_size); + ReadOptions(), storage_options_, icmp_, dummy_meta); bool empty = true; ParsedInternalKey parsed; t->min_sequence = 0; diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc index 0f3b89d9b..3d1420c0c 100644 --- a/db/simple_table_db_test.cc +++ b/db/simple_table_db_test.cc @@ -22,6 +22,8 @@ #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" #include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/table_builder.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" @@ -31,6 +33,7 @@ using std::unique_ptr; +// IS THIS FILE STILL NEEDED? namespace rocksdb { // SimpleTable is a simple table format for UNIT TEST ONLY. It is not built @@ -84,15 +87,13 @@ public: Iterator* NewIterator(const ReadOptions&) override; - Status Get( - const ReadOptions&, const Slice& key, void* arg, - bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool), - void (*mark_key_may_exist)(void*) = nullptr) override; + Status Get(const ReadOptions&, const Slice& key, void* arg, + bool (*handle_result)(void* arg, const ParsedInternalKey& k, + const Slice& v, bool), + void (*mark_key_may_exist)(void*) = nullptr) override; uint64_t ApproximateOffsetOf(const Slice& key) override; - bool TEST_KeyInCache(const ReadOptions& options, const Slice& key) override; - void SetupForCompaction() override; TableProperties& GetTableProperties() override; @@ -244,7 +245,8 @@ Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) { return s; } - int compare_result = rep_->options.comparator->Compare(tmp_slice, target); + InternalKeyComparator ikc(rep_->options.comparator); + int compare_result = ikc.Compare(tmp_slice, target); if (compare_result < 0) { if (left == right) { @@ -279,14 +281,20 @@ Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) { return s; } -Status SimpleTableReader::Get( - const ReadOptions& options, const Slice& k, void* 
arg, - bool (*saver)(void*, const Slice&, const Slice&, bool), - void (*mark_key_may_exist)(void*)) { +Status SimpleTableReader::Get(const ReadOptions& options, const Slice& k, + void* arg, + bool (*saver)(void*, const ParsedInternalKey&, + const Slice&, bool), + void (*mark_key_may_exist)(void*)) { Status s; SimpleTableIterator* iter = new SimpleTableIterator(this); for (iter->Seek(k); iter->Valid(); iter->Next()) { - if (!(*saver)(arg, iter->key(), iter->value(), true)) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(iter->key(), &parsed_key)) { + return Status::Corruption(Slice()); + } + + if (!(*saver)(arg, parsed_key, iter->value(), true)) { break; } } @@ -295,11 +303,6 @@ Status SimpleTableReader::Get( return s; } -bool SimpleTableReader::TEST_KeyInCache(const ReadOptions& options, - const Slice& key) { - return false; -} - uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) { return 0; } @@ -540,27 +543,30 @@ public: const char* Name() const override { return "SimpleTable"; } - Status GetTableReader(const Options& options, const EnvOptions& soptions, - unique_ptr && file, - uint64_t file_size, + Status NewTableReader(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_key, + unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const; - TableBuilder* GetTableBuilder(const Options& options, WritableFile* file, + TableBuilder* NewTableBuilder(const Options& options, + const InternalKeyComparator& internal_key, + WritableFile* file, CompressionType compression_type) const; }; -Status SimpleTableFactory::GetTableReader( +Status SimpleTableFactory::NewTableReader( const Options& options, const EnvOptions& soptions, - unique_ptr && file, uint64_t file_size, + const InternalKeyComparator& internal_key, + unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const { return SimpleTableReader::Open(options, soptions, std::move(file), file_size, table_reader); } -TableBuilder* 
SimpleTableFactory::GetTableBuilder( - const Options& options, WritableFile* file, - CompressionType compression_type) const { +TableBuilder* SimpleTableFactory::NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_key, + WritableFile* file, CompressionType compression_type) const { return new SimpleTableBuilder(options, file, compression_type); } diff --git a/db/skiplist.h b/db/skiplist.h index 2c9c4a6de..e713fe42a 100644 --- a/db/skiplist.h +++ b/db/skiplist.h @@ -34,8 +34,8 @@ #include #include #include "port/port.h" +#include "util/arena.h" #include "util/random.h" -#include "rocksdb/arena.h" namespace rocksdb { @@ -48,7 +48,8 @@ class SkipList { // Create a new SkipList object that will use "cmp" for comparing keys, // and will allocate memory using "*arena". Objects allocated in the arena // must remain allocated for the lifetime of the skiplist object. - explicit SkipList(Comparator cmp, Arena* arena); + explicit SkipList(Comparator cmp, Arena* arena, + int32_t max_height = 12, int32_t branching_factor = 4); // Insert key into the list. // REQUIRES: nothing that compares equal to key is currently in the list. 
@@ -102,7 +103,8 @@ class SkipList { }; private: - enum { kMaxHeight = 12 }; + const int32_t kMaxHeight_; + const int32_t kBranching_; // Immutable after construction Comparator const compare_; @@ -115,8 +117,8 @@ class SkipList { port::AtomicPointer max_height_; // Height of the entire list // Used for optimizing sequential insert patterns - Node* prev_[kMaxHeight]; - int prev_height_; + Node** prev_; + int32_t prev_height_; inline int GetMaxHeight() const { return static_cast( @@ -258,13 +260,12 @@ inline void SkipList::Iterator::SeekToLast() { template int SkipList::RandomHeight() { // Increase height with probability 1 in kBranching - static const unsigned int kBranching = 4; int height = 1; - while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) { + while (height < kMaxHeight_ && ((rnd_.Next() % kBranching_) == 0)) { height++; } assert(height > 0); - assert(height <= kMaxHeight); + assert(height <= kMaxHeight_); return height; } @@ -354,14 +355,24 @@ typename SkipList::Node* SkipList::FindLast() } template -SkipList::SkipList(Comparator cmp, Arena* arena) - : compare_(cmp), +SkipList::SkipList(Comparator cmp, Arena* arena, + int32_t max_height, + int32_t branching_factor) + : kMaxHeight_(max_height), + kBranching_(branching_factor), + compare_(cmp), arena_(arena), - head_(NewNode(0 /* any key will do */, kMaxHeight)), + head_(NewNode(0 /* any key will do */, max_height)), max_height_(reinterpret_cast(1)), prev_height_(1), rnd_(0xdeadbeef) { - for (int i = 0; i < kMaxHeight; i++) { + assert(kMaxHeight_ > 0); + assert(kBranching_ > 0); + // Allocate the prev_ Node* array, directly from the passed-in arena. + // prev_ does not need to be freed, as its life cycle is tied up with + // the arena as a whole. 
+ prev_ = (Node**) arena_->AllocateAligned(sizeof(Node*) * kMaxHeight_); + for (int i = 0; i < kMaxHeight_; i++) { head_->SetNext(i, nullptr); prev_[i] = head_; } diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc index dcbaf0abb..b87ddcbb0 100644 --- a/db/skiplist_test.cc +++ b/db/skiplist_test.cc @@ -10,7 +10,7 @@ #include "db/skiplist.h" #include #include "rocksdb/env.h" -#include "util/arena_impl.h" +#include "util/arena.h" #include "util/hash.h" #include "util/random.h" #include "util/testharness.h" @@ -34,9 +34,9 @@ struct TestComparator { class SkipTest { }; TEST(SkipTest, Empty) { - ArenaImpl arena_impl; + Arena arena; TestComparator cmp; - SkipList list(cmp, &arena_impl); + SkipList list(cmp, &arena); ASSERT_TRUE(!list.Contains(10)); SkipList::Iterator iter(&list); @@ -54,9 +54,9 @@ TEST(SkipTest, InsertAndLookup) { const int R = 5000; Random rnd(1000); std::set keys; - ArenaImpl arena_impl; + Arena arena; TestComparator cmp; - SkipList list(cmp, &arena_impl); + SkipList list(cmp, &arena); for (int i = 0; i < N; i++) { Key key = rnd.Next() % R; if (keys.insert(key).second) { @@ -209,14 +209,14 @@ class ConcurrentTest { // Current state of the test State current_; - ArenaImpl arena_impl_; + Arena arena_; // SkipList is not protected by mu_. We just use a single writer // thread to modify it. 
SkipList list_; public: - ConcurrentTest() : list_(TestComparator(), &arena_impl_) { } + ConcurrentTest() : list_(TestComparator(), &arena_) {} // REQUIRES: External synchronization void WriteStep(Random* rnd) { diff --git a/db/table_cache.cc b/db/table_cache.cc index adf94182d..3301b98d9 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -10,9 +10,10 @@ #include "db/table_cache.h" #include "db/filename.h" +#include "db/version_edit.h" #include "rocksdb/statistics.h" -#include "rocksdb/table.h" +#include "table/table_reader.h" #include "util/coding.h" #include "util/stop_watch.h" @@ -34,7 +35,6 @@ static Slice GetSliceForFileNumber(uint64_t* file_number) { sizeof(*file_number)); } -// TODO(icanadi) Options -> DBOptions TableCache::TableCache(const std::string& dbname, const Options* options, const EnvOptions& storage_options, Cache* const cache) : env_(options->env), @@ -46,7 +46,16 @@ TableCache::TableCache(const std::string& dbname, const Options* options, TableCache::~TableCache() { } +TableReader* TableCache::GetTableReaderFromHandle(Cache::Handle* handle) { + return reinterpret_cast(cache_->Value(handle)); +} + +void TableCache::ReleaseHandle(Cache::Handle* handle) { + cache_->Release(handle); +} + Status TableCache::FindTable(const EnvOptions& toptions, + const InternalKeyComparator& internal_comparator, uint64_t file_number, uint64_t file_size, Cache::Handle** handle, bool* table_io, const bool no_io) { @@ -70,8 +79,9 @@ Status TableCache::FindTable(const EnvOptions& toptions, file->Hint(RandomAccessFile::RANDOM); } StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS); - s = options_->table_factory->GetTableReader( - *options_, toptions, std::move(file), file_size, &table_reader); + s = options_->table_factory->NewTableReader( + *options_, toptions, internal_comparator, std::move(file), file_size, + &table_reader); } if (!s.ok()) { @@ -89,25 +99,28 @@ Status TableCache::FindTable(const EnvOptions& toptions, Iterator* 
TableCache::NewIterator(const ReadOptions& options, const EnvOptions& toptions, - uint64_t file_number, - uint64_t file_size, + const InternalKeyComparator& icomparator, + const FileMetaData& file_meta, TableReader** table_reader_ptr, bool for_compaction) { if (table_reader_ptr != nullptr) { *table_reader_ptr = nullptr; } - - Cache::Handle* handle = nullptr; - Status s = FindTable(toptions, file_number, file_size, &handle, - nullptr, options.read_tier == kBlockCacheTier); + Cache::Handle* handle = file_meta.table_reader_handle; + Status s; + if (!handle) { + s = FindTable(toptions, icomparator, file_meta.number, file_meta.file_size, + &handle, nullptr, options.read_tier == kBlockCacheTier); + } if (!s.ok()) { return NewErrorIterator(s); } - TableReader* table_reader = - reinterpret_cast(cache_->Value(handle)); + TableReader* table_reader = GetTableReaderFromHandle(handle); Iterator* result = table_reader->NewIterator(options); - result->RegisterCleanup(&UnrefEntry, cache_, handle); + if (!file_meta.table_reader_handle) { + result->RegisterCleanup(&UnrefEntry, cache_, handle); + } if (table_reader_ptr != nullptr) { *table_reader_ptr = table_reader; } @@ -120,22 +133,24 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, } Status TableCache::Get(const ReadOptions& options, - uint64_t file_number, - uint64_t file_size, - const Slice& k, - void* arg, - bool (*saver)(void*, const Slice&, const Slice&, bool), - bool* table_io, - void (*mark_key_may_exist)(void*)) { - Cache::Handle* handle = nullptr; - Status s = FindTable(storage_options_, file_number, file_size, - &handle, table_io, - options.read_tier == kBlockCacheTier); + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const Slice& k, void* arg, + bool (*saver)(void*, const ParsedInternalKey&, + const Slice&, bool), + bool* table_io, void (*mark_key_may_exist)(void*)) { + Cache::Handle* handle = file_meta.table_reader_handle; + Status s; + if (!handle) { + s = 
FindTable(storage_options_, internal_comparator, file_meta.number, + file_meta.file_size, &handle, table_io, + options.read_tier == kBlockCacheTier); + } if (s.ok()) { - TableReader* t = - reinterpret_cast(cache_->Value(handle)); + TableReader* t = GetTableReaderFromHandle(handle); s = t->Get(options, k, arg, saver, mark_key_may_exist); - cache_->Release(handle); + if (!file_meta.table_reader_handle) { + ReleaseHandle(handle); + } } else if (options.read_tier && s.IsIncomplete()) { // Couldnt find Table in cache but treat as kFound if no_io set (*mark_key_may_exist)(arg); @@ -145,19 +160,17 @@ Status TableCache::Get(const ReadOptions& options, } bool TableCache::PrefixMayMatch(const ReadOptions& options, - uint64_t file_number, - uint64_t file_size, - const Slice& internal_prefix, - bool* table_io) { + const InternalKeyComparator& icomparator, + uint64_t file_number, uint64_t file_size, + const Slice& internal_prefix, bool* table_io) { Cache::Handle* handle = nullptr; - Status s = FindTable(storage_options_, file_number, - file_size, &handle, table_io); + Status s = FindTable(storage_options_, icomparator, file_number, file_size, + &handle, table_io); bool may_match = true; if (s.ok()) { - TableReader* t = - reinterpret_cast(cache_->Value(handle)); + TableReader* t = GetTableReaderFromHandle(handle); may_match = t->PrefixMayMatch(internal_prefix); - cache_->Release(handle); + ReleaseHandle(handle); } return may_match; } diff --git a/db/table_cache.h b/db/table_cache.h index 9807aeb00..44f47e353 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -12,15 +12,18 @@ #pragma once #include #include + #include "db/dbformat.h" -#include "rocksdb/env.h" -#include "rocksdb/cache.h" #include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/env.h" #include "rocksdb/table.h" +#include "table/table_reader.h" namespace rocksdb { class Env; +struct FileMetaData; class TableCache { public: @@ -35,10 +38,9 @@ class TableCache { // the returned iterator. 
The returned "*tableptr" object is owned by // the cache and should not be deleted, and is valid for as long as the // returned iterator is live. - Iterator* NewIterator(const ReadOptions& options, - const EnvOptions& toptions, - uint64_t file_number, - uint64_t file_size, + Iterator* NewIterator(const ReadOptions& options, const EnvOptions& toptions, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, TableReader** table_reader_ptr = nullptr, bool for_compaction = false); @@ -46,33 +48,40 @@ class TableCache { // call (*handle_result)(arg, found_key, found_value) repeatedly until // it returns false. Status Get(const ReadOptions& options, - uint64_t file_number, - uint64_t file_size, - const Slice& k, - void* arg, - bool (*handle_result)(void*, const Slice&, const Slice&, bool), - bool* table_io, - void (*mark_key_may_exist)(void*) = nullptr); + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const Slice& k, void* arg, + bool (*handle_result)(void*, const ParsedInternalKey&, + const Slice&, bool), + bool* table_io, void (*mark_key_may_exist)(void*) = nullptr); // Determine whether the table may contain the specified prefix. 
If - // the table index of blooms are not in memory, this may cause an I/O - bool PrefixMayMatch(const ReadOptions& options, uint64_t file_number, - uint64_t file_size, const Slice& internal_prefix, - bool* table_io); + // the table index or blooms are not in memory, this may cause an I/O + bool PrefixMayMatch(const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + uint64_t file_number, uint64_t file_size, + const Slice& internal_prefix, bool* table_io); // Evict any entry for the specified file number static void Evict(Cache* cache, uint64_t file_number); + // Find table reader + Status FindTable(const EnvOptions& toptions, + const InternalKeyComparator& internal_comparator, + uint64_t file_number, uint64_t file_size, Cache::Handle**, + bool* table_io = nullptr, const bool no_io = false); + + // Get TableReader from a cache handle. + TableReader* GetTableReaderFromHandle(Cache::Handle* handle); + + // Release the handle from a cache + void ReleaseHandle(Cache::Handle* handle); + private: Env* const env_; const std::string dbname_; const Options* options_; const EnvOptions& storage_options_; Cache* const cache_; - - Status FindTable(const EnvOptions& toptions, uint64_t file_number, - uint64_t file_size, Cache::Handle**, bool* table_io=nullptr, - const bool no_io = false); }; } // namespace rocksdb diff --git a/db/table_properties_collector.cc b/db/table_properties_collector.cc index 3654663c1..25bd70036 100644 --- a/db/table_properties_collector.cc +++ b/db/table_properties_collector.cc @@ -10,87 +10,6 @@ namespace rocksdb { -namespace { - void AppendProperty( - std::string& props, - const std::string& key, - const std::string& value, - const std::string& prop_delim, - const std::string& kv_delim) { - props.append(key); - props.append(kv_delim); - props.append(value); - props.append(prop_delim); - } - - template - void AppendProperty( - std::string& props, - const std::string& key, - const TValue& value, - const std::string& prop_delim, - 
const std::string& kv_delim) { - AppendProperty( - props, key, std::to_string(value), prop_delim, kv_delim - ); - } -} - -std::string TableProperties::ToString( - const std::string& prop_delim, - const std::string& kv_delim) const { - std::string result; - result.reserve(1024); - - // Basic Info - AppendProperty( - result, "# data blocks", num_data_blocks, prop_delim, kv_delim - ); - AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim); - - AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim); - AppendProperty( - result, - "raw average key size", - num_entries != 0 ? 1.0 * raw_key_size / num_entries : 0.0, - prop_delim, - kv_delim - ); - AppendProperty( - result, "raw value size", raw_value_size, prop_delim, kv_delim - ); - AppendProperty( - result, - "raw average value size", - num_entries != 0 ? 1.0 * raw_value_size / num_entries : 0.0, - prop_delim, - kv_delim - ); - - AppendProperty(result, "data block size", data_size, prop_delim, kv_delim); - AppendProperty(result, "index block size", index_size, prop_delim, kv_delim); - AppendProperty( - result, "filter block size", filter_size, prop_delim, kv_delim - ); - AppendProperty( - result, - "(estimated) table size", - data_size + index_size + filter_size, - prop_delim, - kv_delim - ); - - AppendProperty( - result, - "filter policy name", - filter_policy_name.empty() ? 
std::string("N/A") : filter_policy_name, - prop_delim, - kv_delim - ); - - return result; -} - Status InternalKeyPropertiesCollector::Add( const Slice& key, const Slice& value) { ParsedInternalKey ikey; @@ -106,7 +25,7 @@ Status InternalKeyPropertiesCollector::Add( } Status InternalKeyPropertiesCollector::Finish( - TableProperties::UserCollectedProperties* properties) { + UserCollectedProperties* properties) { assert(properties); assert(properties->find( InternalKeyTablePropertiesNames::kDeletedKeys) == properties->end()); @@ -118,7 +37,7 @@ Status InternalKeyPropertiesCollector::Finish( return Status::OK(); } -TableProperties::UserCollectedProperties +UserCollectedProperties InternalKeyPropertiesCollector::GetReadableProperties() const { return { { "kDeletedKeys", std::to_string(deleted_keys_) } @@ -137,11 +56,11 @@ Status UserKeyTablePropertiesCollector::Add( } Status UserKeyTablePropertiesCollector::Finish( - TableProperties::UserCollectedProperties* properties) { + UserCollectedProperties* properties) { return collector_->Finish(properties); } -TableProperties::UserCollectedProperties +UserCollectedProperties UserKeyTablePropertiesCollector::GetReadableProperties() const { return collector_->GetReadableProperties(); } @@ -151,7 +70,7 @@ const std::string InternalKeyTablePropertiesNames::kDeletedKeys = "rocksdb.deleted.keys"; uint64_t GetDeletedKeys( - const TableProperties::UserCollectedProperties& props) { + const UserCollectedProperties& props) { auto pos = props.find(InternalKeyTablePropertiesNames::kDeletedKeys); if (pos == props.end()) { return 0; diff --git a/db/table_properties_collector.h b/db/table_properties_collector.h index 533130db7..6cf56291a 100644 --- a/db/table_properties_collector.h +++ b/db/table_properties_collector.h @@ -24,15 +24,13 @@ class InternalKeyPropertiesCollector : public TablePropertiesCollector { public: virtual Status Add(const Slice& key, const Slice& value) override; - virtual Status Finish( - 
TableProperties::UserCollectedProperties* properties) override; + virtual Status Finish(UserCollectedProperties* properties) override; virtual const char* Name() const override { return "InternalKeyPropertiesCollector"; } - TableProperties::UserCollectedProperties - GetReadableProperties() const override; + UserCollectedProperties GetReadableProperties() const override; private: uint64_t deleted_keys_ = 0; @@ -61,13 +59,11 @@ class UserKeyTablePropertiesCollector : public TablePropertiesCollector { virtual Status Add(const Slice& key, const Slice& value) override; - virtual Status Finish( - TableProperties::UserCollectedProperties* properties) override; + virtual Status Finish(UserCollectedProperties* properties) override; virtual const char* Name() const override { return collector_->Name(); } - TableProperties::UserCollectedProperties - GetReadableProperties() const override; + UserCollectedProperties GetReadableProperties() const override; protected: std::shared_ptr collector_; diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 6f405b28a..961a7302b 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -7,12 +7,14 @@ #include #include -#include "db/dbformat.h" #include "db/db_impl.h" +#include "db/dbformat.h" #include "db/table_properties_collector.h" -#include "rocksdb/table_properties.h" #include "rocksdb/table.h" #include "table/block_based_table_factory.h" +#include "table/meta_blocks.h" +#include "table/plain_table_factory.h" +#include "table/table_builder.h" #include "util/coding.h" #include "util/testharness.h" #include "util/testutil.h" @@ -20,8 +22,6 @@ namespace rocksdb { class TablePropertiesTest { - private: - unique_ptr table_reader_; }; // TODO(kailiu) the following classes should be moved to some more general @@ -83,30 +83,13 @@ class DumbLogger : public Logger { }; // Utilities test functions -void MakeBuilder( - const Options& options, - std::unique_ptr* 
writable, - std::unique_ptr* builder) { +void MakeBuilder(const Options& options, + const InternalKeyComparator& internal_comparator, + std::unique_ptr* writable, + std::unique_ptr* builder) { writable->reset(new FakeWritableFile); - builder->reset( - options.table_factory->GetTableBuilder(options, writable->get(), - options.compression)); -} - -void OpenTable( - const Options& options, - const std::string& contents, - std::unique_ptr* table_reader) { - - std::unique_ptr file(new FakeRandomeAccessFile(contents)); - auto s = options.table_factory->GetTableReader( - options, - EnvOptions(), - std::move(file), - contents.size(), - table_reader - ); - ASSERT_OK(s); + builder->reset(options.table_factory->NewTableBuilder( + options, internal_comparator, writable->get(), options.compression)); } // Collects keys that starts with "A" in a table. @@ -114,10 +97,10 @@ class RegularKeysStartWithA: public TablePropertiesCollector { public: const char* Name() const { return "RegularKeysStartWithA"; } - Status Finish(TableProperties::UserCollectedProperties* properties) { + Status Finish(UserCollectedProperties* properties) { std::string encoded; PutVarint32(&encoded, count_); - *properties = TableProperties::UserCollectedProperties { + *properties = UserCollectedProperties { { "TablePropertiesTest", "Rocksdb" }, { "Count", encoded } }; @@ -132,8 +115,7 @@ class RegularKeysStartWithA: public TablePropertiesCollector { return Status::OK(); } - virtual TableProperties::UserCollectedProperties - GetReadableProperties() const { + virtual UserCollectedProperties GetReadableProperties() const { return {}; } @@ -142,23 +124,65 @@ class RegularKeysStartWithA: public TablePropertiesCollector { uint32_t count_ = 0; }; -TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) { - Options options; - +extern uint64_t kBlockBasedTableMagicNumber; +extern uint64_t kPlainTableMagicNumber; +void TestCustomizedTablePropertiesCollector( + uint64_t magic_number, bool encode_as_internal, const 
Options& options, + const InternalKeyComparator& internal_comparator) { // make sure the entries will be inserted with order. std::map kvs = { - {"About", "val5"}, // starts with 'A' - {"Abstract", "val2"}, // starts with 'A' - {"Around", "val7"}, // starts with 'A' - {"Beyond", "val3"}, - {"Builder", "val1"}, - {"Cancel", "val4"}, - {"Find", "val6"}, + {"About ", "val5"}, // starts with 'A' + {"Abstract", "val2"}, // starts with 'A' + {"Around ", "val7"}, // starts with 'A' + {"Beyond ", "val3"}, + {"Builder ", "val1"}, + {"Cancel ", "val4"}, + {"Find ", "val6"}, }; + // -- Step 1: build table + std::unique_ptr builder; + std::unique_ptr writable; + MakeBuilder(options, internal_comparator, &writable, &builder); + + for (const auto& kv : kvs) { + if (encode_as_internal) { + InternalKey ikey(kv.first, 0, ValueType::kTypeValue); + builder->Add(ikey.Encode(), kv.second); + } else { + builder->Add(kv.first, kv.second); + } + } + ASSERT_OK(builder->Finish()); + + // -- Step 2: Read properties + FakeRandomeAccessFile readable(writable->contents()); + TableProperties props; + Status s = ReadTableProperties( + &readable, + writable->contents().size(), + magic_number, + Env::Default(), + nullptr, + &props + ); + ASSERT_OK(s); + + auto user_collected = props.user_collected_properties; + + ASSERT_EQ("Rocksdb", user_collected.at("TablePropertiesTest")); + + uint32_t starts_with_A = 0; + Slice key(user_collected.at("Count")); + ASSERT_TRUE(GetVarint32(&key, &starts_with_A)); + ASSERT_EQ(3u, starts_with_A); +} + +TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) { // Test properties collectors with internal keys or regular keys + // for block based table for (bool encode_as_internal : { true, false }) { - // -- Step 1: build table + Options options; auto collector = new RegularKeysStartWithA(); if (encode_as_internal) { options.table_properties_collectors = { @@ -168,97 +192,111 @@ TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) { 
options.table_properties_collectors.resize(1); options.table_properties_collectors[0].reset(collector); } - std::unique_ptr builder; - std::unique_ptr writable; - MakeBuilder(options, &writable, &builder); - - for (const auto& kv : kvs) { - if (encode_as_internal) { - InternalKey ikey(kv.first, 0, ValueType::kTypeValue); - builder->Add(ikey.Encode(), kv.second); - } else { - builder->Add(kv.first, kv.second); - } - } - ASSERT_OK(builder->Finish()); - - // -- Step 2: Open table - std::unique_ptr table_reader; - OpenTable(options, writable->contents(), &table_reader); - const auto& properties = - table_reader->GetTableProperties().user_collected_properties; - - ASSERT_EQ("Rocksdb", properties.at("TablePropertiesTest")); - - uint32_t starts_with_A = 0; - Slice key(properties.at("Count")); - ASSERT_TRUE(GetVarint32(&key, &starts_with_A)); - ASSERT_EQ(3u, starts_with_A); + test::PlainInternalKeyComparator ikc(options.comparator); + TestCustomizedTablePropertiesCollector(kBlockBasedTableMagicNumber, + encode_as_internal, options, ikc); } + + // test plain table + Options options; + options.table_properties_collectors.push_back( + std::make_shared() + ); + options.table_factory = std::make_shared(8, 8, 0); + test::PlainInternalKeyComparator ikc(options.comparator); + TestCustomizedTablePropertiesCollector(kPlainTableMagicNumber, true, options, + ikc); } -TEST(TablePropertiesTest, InternalKeyPropertiesCollector) { +void TestInternalKeyPropertiesCollector( + uint64_t magic_number, + bool sanitized, + std::shared_ptr table_factory) { InternalKey keys[] = { - InternalKey("A", 0, ValueType::kTypeValue), - InternalKey("B", 0, ValueType::kTypeValue), - InternalKey("C", 0, ValueType::kTypeValue), - InternalKey("W", 0, ValueType::kTypeDeletion), - InternalKey("X", 0, ValueType::kTypeDeletion), - InternalKey("Y", 0, ValueType::kTypeDeletion), - InternalKey("Z", 0, ValueType::kTypeDeletion), + InternalKey("A ", 0, ValueType::kTypeValue), + InternalKey("B ", 0, 
ValueType::kTypeValue), + InternalKey("C ", 0, ValueType::kTypeValue), + InternalKey("W ", 0, ValueType::kTypeDeletion), + InternalKey("X ", 0, ValueType::kTypeDeletion), + InternalKey("Y ", 0, ValueType::kTypeDeletion), + InternalKey("Z ", 0, ValueType::kTypeDeletion), }; - for (bool sanitized : { false, true }) { - std::unique_ptr builder; - std::unique_ptr writable; - Options options; - if (sanitized) { - options.table_properties_collectors = { - std::make_shared() - }; - // with sanitization, even regular properties collector will be able to - // handle internal keys. - auto comparator = options.comparator; - // HACK: Set options.info_log to avoid writing log in - // SanitizeOptions(). - options.info_log = std::make_shared(); - options = SanitizeOptions( - "db", // just a place holder - nullptr, // with skip internal key comparator - nullptr, // don't care filter policy - options - ); - options.comparator = comparator; - } else { - options.table_properties_collectors = { - std::make_shared() - }; - } - - MakeBuilder(options, &writable, &builder); - for (const auto& k : keys) { - builder->Add(k.Encode(), "val"); - } + std::unique_ptr builder; + std::unique_ptr writable; + Options options; + test::PlainInternalKeyComparator pikc(options.comparator); + + options.table_factory = table_factory; + if (sanitized) { + options.table_properties_collectors = { + std::make_shared() + }; + // with sanitization, even regular properties collector will be able to + // handle internal keys. + auto comparator = options.comparator; + // HACK: Set options.info_log to avoid writing log in + // SanitizeOptions(). 
+ options.info_log = std::make_shared(); + options = SanitizeOptions("db", // just a place holder + &pikc, nullptr, // don't care filter policy + options); + options.comparator = comparator; + } else { + options.table_properties_collectors = { + std::make_shared() + }; + } - ASSERT_OK(builder->Finish()); + MakeBuilder(options, pikc, &writable, &builder); + for (const auto& k : keys) { + builder->Add(k.Encode(), "val"); + } - std::unique_ptr table_reader; - OpenTable(options, writable->contents(), &table_reader); - const auto& properties = - table_reader->GetTableProperties().user_collected_properties; + ASSERT_OK(builder->Finish()); + + FakeRandomeAccessFile readable(writable->contents()); + TableProperties props; + Status s = ReadTableProperties( + &readable, + writable->contents().size(), + magic_number, + Env::Default(), + nullptr, + &props + ); + ASSERT_OK(s); - uint64_t deleted = GetDeletedKeys(properties); - ASSERT_EQ(4u, deleted); + auto user_collected = props.user_collected_properties; + uint64_t deleted = GetDeletedKeys(user_collected); + ASSERT_EQ(4u, deleted); - if (sanitized) { - uint32_t starts_with_A = 0; - Slice key(properties.at("Count")); - ASSERT_TRUE(GetVarint32(&key, &starts_with_A)); - ASSERT_EQ(1u, starts_with_A); - } + if (sanitized) { + uint32_t starts_with_A = 0; + Slice key(user_collected.at("Count")); + ASSERT_TRUE(GetVarint32(&key, &starts_with_A)); + ASSERT_EQ(1u, starts_with_A); } } +TEST(TablePropertiesTest, InternalKeyPropertiesCollector) { + TestInternalKeyPropertiesCollector( + kBlockBasedTableMagicNumber, + true /* sanitize */, + std::make_shared() + ); + TestInternalKeyPropertiesCollector( + kBlockBasedTableMagicNumber, + true /* not sanitize */, + std::make_shared() + ); + TestInternalKeyPropertiesCollector( + kPlainTableMagicNumber, + false /* not sanitize */, + std::make_shared(8, 8, 0) + ); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/version_edit.cc b/db/version_edit.cc index 
50fc0dec5..87d303e25 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -78,12 +78,10 @@ void VersionEdit::EncodeTo(std::string* dst) const { PutVarint64(dst, last_sequence_); } - for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); - iter != deleted_files_.end(); - ++iter) { + for (const auto& deleted : deleted_files_) { PutVarint32(dst, kDeletedFile); - PutVarint32(dst, iter->first); // level - PutVarint64(dst, iter->second); // file number + PutVarint32(dst, deleted.first /* level */); + PutVarint64(dst, deleted.second /* file number */); } for (size_t i = 0; i < new_files_.size(); i++) { diff --git a/db/version_edit.h b/db/version_edit.h index b2df9f8d3..bd5f0df95 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -12,6 +12,7 @@ #include #include #include +#include "rocksdb/cache.h" #include "db/dbformat.h" namespace rocksdb { @@ -29,8 +30,17 @@ struct FileMetaData { SequenceNumber smallest_seqno;// The smallest seqno in this file SequenceNumber largest_seqno; // The largest seqno in this file - FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0), - being_compacted(false) {} + // Needs to be disposed when refs becomes 0. 
+ Cache::Handle* table_reader_handle; + + FileMetaData(uint64_t number, uint64_t file_size) + : refs(0), + allowed_seeks(1 << 30), + number(number), + file_size(file_size), + being_compacted(false), + table_reader_handle(nullptr) {} + FileMetaData() : FileMetaData(0, 0) {} }; class VersionEdit { @@ -70,6 +80,7 @@ class VersionEdit { const InternalKey& largest, const SequenceNumber& smallest_seqno, const SequenceNumber& largest_seqno) { + assert(smallest_seqno <= largest_seqno); FileMetaData f; f.number = file; f.file_size = file_size; @@ -77,13 +88,12 @@ class VersionEdit { f.largest = largest; f.smallest_seqno = smallest_seqno; f.largest_seqno = largest_seqno; - assert(smallest_seqno <= largest_seqno); new_files_.push_back(std::make_pair(level, f)); } // Delete the specified "file" from the specified "level". void DeleteFile(int level, uint64_t file) { - deleted_files_.insert(std::make_pair(level, file)); + deleted_files_.insert({level, file}); } // Number of edits @@ -120,7 +130,7 @@ class VersionEdit { private: friend class VersionSet; - typedef std::set< std::pair > DeletedFileSet; + typedef std::set< std::pair> DeletedFileSet; bool GetLevel(Slice* input, int* level, const char** msg); diff --git a/db/version_set.cc b/db/version_set.cc index 1f64171c7..228d323b7 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -14,6 +14,7 @@ #include #include #include + #include "db/filename.h" #include "db/log_reader.h" #include "db/log_writer.h" @@ -23,7 +24,7 @@ #include "db/compaction.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" -#include "rocksdb/table.h" +#include "table/table_reader.h" #include "table/merger.h" #include "table/two_level_iterator.h" #include "util/coding.h" @@ -54,6 +55,10 @@ Version::~Version() { assert(f->refs > 0); f->refs--; if (f->refs <= 0) { + if (f->table_reader_handle) { + cfd_->table_cache()->ReleaseHandle(f->table_reader_handle); + f->table_reader_handle = nullptr; + } vset_->obsolete_files_.push_back(f); } } @@ 
-188,11 +193,10 @@ class Version::LevelFileNumIterator : public Iterator { mutable char value_buf_[16]; }; -static Iterator* GetFileIterator(void* arg, - const ReadOptions& options, +static Iterator* GetFileIterator(void* arg, const ReadOptions& options, const EnvOptions& soptions, - const Slice& file_value, - bool for_compaction) { + const InternalKeyComparator& icomparator, + const Slice& file_value, bool for_compaction) { TableCache* cache = reinterpret_cast(arg); if (file_value.size() != 16) { return NewErrorIterator( @@ -205,12 +209,11 @@ static Iterator* GetFileIterator(void* arg, options_copy = options; options_copy.prefix = nullptr; } - return cache->NewIterator(options.prefix ? options_copy : options, - soptions, - DecodeFixed64(file_value.data()), - DecodeFixed64(file_value.data() + 8), - nullptr /* don't need reference to table*/, - for_compaction); + FileMetaData meta(DecodeFixed64(file_value.data()), + DecodeFixed64(file_value.data() + 8)); + return cache->NewIterator( + options.prefix ? 
options_copy : options, soptions, icomparator, meta, + nullptr /* don't need reference to table*/, for_compaction); } } @@ -230,7 +233,8 @@ bool Version::PrefixMayMatch(const ReadOptions& options, may_match = true; } else { may_match = cfd_->table_cache()->PrefixMayMatch( - options, DecodeFixed64(level_iter->value().data()), + options, cfd_->internal_comparator(), + DecodeFixed64(level_iter->value().data()), DecodeFixed64(level_iter->value().data() + 8), internal_prefix, nullptr); } @@ -252,7 +256,7 @@ Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, } } return NewTwoLevelIterator(level_iter, &GetFileIterator, cfd_->table_cache(), - options, soptions); + options, soptions, cfd_->internal_comparator()); } void Version::AddIterators(const ReadOptions& options, @@ -261,7 +265,7 @@ void Version::AddIterators(const ReadOptions& options, // Merge all level zero files together since they may overlap for (const FileMetaData* file : files_[0]) { iters->push_back(cfd_->table_cache()->NewIterator( - options, soptions, file->number, file->file_size)); + options, soptions, cfd_->internal_comparator(), *file)); } // For levels > 0, we can use a concatenating iterator that sequentially @@ -311,83 +315,73 @@ static void MarkKeyMayExist(void* arg) { } } -static bool SaveValue(void* arg, const Slice& ikey, const Slice& v, bool didIO){ +static bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, + const Slice& v, bool didIO) { Saver* s = reinterpret_cast(arg); MergeContext* merge_contex = s->merge_context; std::string merge_result; // temporary area for merge results later assert(s != nullptr && merge_contex != nullptr); - ParsedInternalKey parsed_key; // TODO: didIO and Merge? s->didIO = didIO; - if (!ParseInternalKey(ikey, &parsed_key)) { - // TODO: what about corrupt during Merge? - s->state = kCorrupt; - } else { - if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) { - // Key matches. 
Process it - switch (parsed_key.type) { - case kTypeValue: - if (kNotFound == s->state) { - s->state = kFound; - s->value->assign(v.data(), v.size()); - } else if (kMerge == s->state) { - assert(s->merge_operator != nullptr); - s->state = kFound; - if (!s->merge_operator->FullMerge(s->user_key, &v, - merge_contex->GetOperands(), - s->value, s->logger)) { - RecordTick(s->statistics, NUMBER_MERGE_FAILURES); - s->state = kCorrupt; - } - } else { - assert(false); + if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) { + // Key matches. Process it + switch (parsed_key.type) { + case kTypeValue: + if (kNotFound == s->state) { + s->state = kFound; + s->value->assign(v.data(), v.size()); + } else if (kMerge == s->state) { + assert(s->merge_operator != nullptr); + s->state = kFound; + if (!s->merge_operator->FullMerge(s->user_key, &v, + merge_contex->GetOperands(), + s->value, s->logger)) { + RecordTick(s->statistics, NUMBER_MERGE_FAILURES); + s->state = kCorrupt; } - return false; + } else { + assert(false); + } + return false; - case kTypeDeletion: - if (kNotFound == s->state) { - s->state = kDeleted; - } else if (kMerge == s->state) { - s->state = kFound; + case kTypeDeletion: + if (kNotFound == s->state) { + s->state = kDeleted; + } else if (kMerge == s->state) { + s->state = kFound; if (!s->merge_operator->FullMerge(s->user_key, nullptr, merge_contex->GetOperands(), s->value, s->logger)) { - RecordTick(s->statistics, NUMBER_MERGE_FAILURES); - s->state = kCorrupt; - } - } else { - assert(false); + RecordTick(s->statistics, NUMBER_MERGE_FAILURES); + s->state = kCorrupt; } - return false; - - case kTypeMerge: - assert(s->state == kNotFound || s->state == kMerge); - s->state = kMerge; - merge_contex->PushOperand(v); - while (merge_contex->GetNumOperands() >= 2) { - // Attempt to merge operands together via user associateive merge - if (s->merge_operator->PartialMerge(s->user_key, - merge_contex->GetOperand(0), - merge_contex->GetOperand(1), - &merge_result, - 
s->logger)) { - merge_contex->PushPartialMergeResult(merge_result); - } else { - // Associative merge returns false ==> stack the operands - break; - } - } - return true; - - case kTypeColumnFamilyDeletion: - case kTypeColumnFamilyValue: - case kTypeColumnFamilyMerge: - case kTypeLogData: + } else { assert(false); + } + return false; + + case kTypeMerge: + assert(s->state == kNotFound || s->state == kMerge); + s->state = kMerge; + merge_contex->PushOperand(v); + while (merge_contex->GetNumOperands() >= 2) { + // Attempt to merge operands together via user associateive merge + if (s->merge_operator->PartialMerge( + s->user_key, merge_contex->GetOperand(0), + merge_contex->GetOperand(1), &merge_result, s->logger)) { + merge_contex->PushPartialMergeResult(merge_result); + } else { + // Associative merge returns false ==> stack the operands break; + } } + return true; + + default: + assert(false); + break; } } @@ -524,8 +518,8 @@ void Version::Get(const ReadOptions& options, prev_file = f; #endif bool tableIO = false; - *status = cfd_->table_cache()->Get(options, f->number, f->file_size, ikey, - &saver, SaveValue, &tableIO, + *status = cfd_->table_cache()->Get(options, cfd_->internal_comparator(), + *f, ikey, &saver, SaveValue, &tableIO, MarkKeyMayExist); // TODO: examine the behavior for corrupted key if (!status->ok()) { @@ -707,7 +701,7 @@ bool CompareSeqnoDescending(const Version::Fsize& first, return false; } -} // anonymous namespace +} // anonymous namespace void Version::UpdateFilesBySize() { // No need to sort the highest level because it is never compacted. 
@@ -756,12 +750,14 @@ void Version::Ref() { ++refs_; } -void Version::Unref() { +bool Version::Unref() { assert(refs_ >= 1); --refs_; if (refs_ == 0) { delete this; + return true; } + return false; } bool Version::NeedsCompaction() const { @@ -1200,10 +1196,15 @@ class VersionSet::Builder { FileMetaData* f = to_unref[i]; f->refs--; if (f->refs <= 0) { + if (f->table_reader_handle) { + cfd_->table_cache()->ReleaseHandle(f->table_reader_handle); + f->table_reader_handle = nullptr; + } delete f; } } } + delete[] levels_; base_->Unref(); } @@ -1280,19 +1281,17 @@ class VersionSet::Builder { // Delete files const VersionEdit::DeletedFileSet& del = edit->deleted_files_; - for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin(); - iter != del.end(); - ++iter) { - const int level = iter->first; - const uint64_t number = iter->second; + for (const auto& del_file : del) { + const auto level = del_file.first; + const auto number = del_file.second; levels_[level].deleted_files.insert(number); CheckConsistencyForDeletes(edit, number, level); } // Add new files - for (size_t i = 0; i < edit->new_files_.size(); i++) { - const int level = edit->new_files_[i].first; - FileMetaData* f = new FileMetaData(edit->new_files_[i].second); + for (const auto& new_file : edit->new_files_) { + const int level = new_file.first; + FileMetaData* f = new FileMetaData(new_file.second); f->refs = 1; // We arrange to automatically compact this file after @@ -1325,23 +1324,21 @@ class VersionSet::Builder { for (int level = 0; level < base_->NumberLevels(); level++) { // Merge the set of added files with the set of pre-existing files. // Drop any deleted files. Store the result in *v. 
- const std::vector& base_files = base_->files_[level]; - std::vector::const_iterator base_iter = base_files.begin(); - std::vector::const_iterator base_end = base_files.end(); - const FileSet* added = levels_[level].added_files; - v->files_[level].reserve(base_files.size() + added->size()); - for (FileSet::const_iterator added_iter = added->begin(); - added_iter != added->end(); - ++added_iter) { + const auto& base_files = base_->files_[level]; + auto base_iter = base_files.begin(); + auto base_end = base_files.end(); + const auto& added_files = *levels_[level].added_files; + v->files_[level].reserve(base_files.size() + added_files.size()); + + for (const auto& added : added_files) { // Add all smaller files listed in base_ - for (std::vector::const_iterator bpos - = std::upper_bound(base_iter, base_end, *added_iter, cmp); + for (auto bpos = std::upper_bound(base_iter, base_end, added, cmp); base_iter != bpos; ++base_iter) { MaybeAddFile(v, level, *base_iter); } - MaybeAddFile(v, level, *added_iter); + MaybeAddFile(v, level, added); } // Add remaining base files @@ -1353,11 +1350,24 @@ class VersionSet::Builder { CheckConsistency(v); } + void LoadTableHandlers() { + for (int level = 0; level < cfd_->NumberLevels(); level++) { + for (auto& file_meta : *(levels_[level].added_files)) { + assert (!file_meta->table_reader_handle); + bool table_io; + cfd_->table_cache()->FindTable( + base_->vset_->storage_options_, cfd_->internal_comparator(), + file_meta->number, file_meta->file_size, + &file_meta->table_reader_handle, &table_io, false); + } + } + } + void MaybeAddFile(Version* v, int level, FileMetaData* f) { if (levels_[level].deleted_files.count(f->number) > 0) { // File is deleted: do nothing } else { - std::vector* files = &v->files_[level]; + auto* files = &v->files_[level]; if (level > 0 && !files->empty()) { // Must not overlap assert(cfd_->internal_comparator().Compare( @@ -1442,13 +1452,12 @@ Status VersionSet::LogAndApply(ColumnFamilyData* 
column_family_data, ManifestWriter* last_writer = &w; assert(!manifest_writers_.empty()); assert(manifest_writers_.front() == &w); - std::deque::iterator iter = manifest_writers_.begin(); - for (; iter != manifest_writers_.end(); ++iter) { - if ((*iter)->cfd->GetID() != column_family_data->GetID()) { + for (const auto& writer : manifest_writers_) { + if (writer->cfd->GetID() != column_family_data->GetID()) { // group commits across column families are not yet supported break; } - last_writer = *iter; + last_writer = writer; LogAndApplyHelper(column_family_data, &builder, v, last_writer->edit, mu); batch_edits.push_back(last_writer->edit); } @@ -1456,7 +1465,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, // Initialize new descriptor log file if necessary by creating // a temporary file that contains a snapshot of the current version. - std::string new_manifest_file; + std::string new_manifest_filename; uint64_t new_manifest_file_size = 0; Status s; // we will need this if we are creating new manifest @@ -1470,11 +1479,11 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, } if (new_descriptor_log) { - new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_); + new_manifest_filename = DescriptorFileName(dbname_, manifest_file_number_); edit->SetNextFile(next_file_number_); } - // Unlock during expensive MANIFEST log write. New writes cannot get here + // Unlock during expensive operations. New writes cannot get here // because &w is ensuring that all new writes get queued. { // calculate the amount of data being compacted at every level @@ -1484,11 +1493,18 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, mu->Unlock(); + if (options_->max_open_files == -1) { + // unlimited table cache. Pre-load table handle now. + // Need to do it out of the mutex. 
+ builder.LoadTableHandlers(); + } + // This is fine because everything inside of this block is serialized -- // only one thread can be here at the same time - if (!new_manifest_file.empty()) { + if (!new_manifest_filename.empty()) { unique_ptr descriptor_file; - s = env_->NewWritableFile(new_manifest_file, &descriptor_file, + s = env_->NewWritableFile(new_manifest_filename, + &descriptor_file, storage_options_); if (s.ok()) { descriptor_log_.reset(new log::Writer(std::move(descriptor_file))); @@ -1536,7 +1552,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, // If we just created a new descriptor file, install it by writing a // new CURRENT file that points to it. - if (s.ok() && !new_manifest_file.empty()) { + if (s.ok() && !new_manifest_filename.empty()) { s = SetCurrentFile(env_, dbname_, manifest_file_number_); if (s.ok() && old_manifest_file_number < manifest_file_number_) { // delete old manifest file @@ -1573,9 +1589,9 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, Log(options_->info_log, "Error in committing version %lu", (unsigned long)v->GetVersionNumber()); delete v; - if (!new_manifest_file.empty()) { + if (!new_manifest_filename.empty()) { descriptor_log_.reset(); - env_->DeleteFile(new_manifest_file); + env_->DeleteFile(new_manifest_filename); } } @@ -1631,27 +1647,33 @@ Status VersionSet::Recover( std::set column_families_not_found; // Read "CURRENT" file, which contains a pointer to the current manifest file - std::string current; - Status s = ReadFileToString(env_, CurrentFileName(dbname_), ¤t); + std::string manifest_filename; + Status s = ReadFileToString( + env_, CurrentFileName(dbname_), &manifest_filename + ); if (!s.ok()) { return s; } - if (current.empty() || current[current.size()-1] != '\n') { + if (manifest_filename.empty() || + manifest_filename.back() != '\n') { return Status::Corruption("CURRENT file does not end with newline"); } - current.resize(current.size() - 1); + // remove the 
trailing '\n' + manifest_filename.resize(manifest_filename.size() - 1); Log(options_->info_log, "Recovering from manifest file:%s\n", - current.c_str()); + manifest_filename.c_str()); - std::string dscname = dbname_ + "/" + current; - unique_ptr file; - s = env_->NewSequentialFile(dscname, &file, storage_options_); + manifest_filename = dbname_ + "/" + manifest_filename; + unique_ptr manifest_file; + s = env_->NewSequentialFile( + manifest_filename, &manifest_file, storage_options_ + ); if (!s.ok()) { return s; } uint64_t manifest_file_size; - s = env_->GetFileSize(dscname, &manifest_file_size); + s = env_->GetFileSize(manifest_filename, &manifest_file_size); if (!s.ok()) { return s; } @@ -1682,8 +1704,8 @@ Status VersionSet::Recover( { VersionSet::LogReporter reporter; reporter.status = &s; - log::Reader reader(std::move(file), &reporter, true/*checksum*/, - 0/*initial_offset*/); + log::Reader reader(std::move(manifest_file), &reporter, true /*checksum*/, + 0 /*initial_offset*/); Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { @@ -1797,7 +1819,6 @@ Status VersionSet::Recover( } } } - file.reset(); if (s.ok()) { if (!have_next_file) { @@ -1846,7 +1867,7 @@ Status VersionSet::Recover( "manifest_file_number is %lu, next_file_number is %lu, " "last_sequence is %lu, log_number is %lu," "prev_log_number is %lu\n", - current.c_str(), + manifest_filename.c_str(), (unsigned long)manifest_file_number_, (unsigned long)next_file_number_, (unsigned long)last_sequence_, @@ -2229,8 +2250,8 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { // approximate offset of "ikey" within the table. 
TableReader* table_reader_ptr; Iterator* iter = v->cfd_->table_cache()->NewIterator( - ReadOptions(), storage_options_, files[i]->number, - files[i]->file_size, &table_reader_ptr); + ReadOptions(), storage_options_, v->cfd_->internal_comparator(), + *(files[i]), &table_reader_ptr); if (table_reader_ptr != nullptr) { result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode()); } @@ -2285,8 +2306,9 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { if (c->level() + which == 0) { for (const auto& file : *c->inputs(which)) { list[num++] = c->column_family_data()->table_cache()->NewIterator( - options, storage_options_compactions_, file->number, - file->file_size, nullptr, true /* for compaction */); + options, storage_options_compactions_, + c->column_family_data()->internal_comparator(), *file, nullptr, + true /* for compaction */); } } else { // Create concatenating iterator for the files from this level @@ -2295,13 +2317,14 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { c->column_family_data()->internal_comparator(), c->inputs(which)), &GetFileIterator, c->column_family_data()->table_cache(), options, - storage_options_, true /* for compaction */); + storage_options_, c->column_family_data()->internal_comparator(), + true /* for compaction */); } } } assert(num <= space); Iterator* result = NewMergingIterator( - &c->column_family_data()->internal_comparator(), list, num); + env_, &c->column_family_data()->internal_comparator(), list, num); delete[] list; return result; } @@ -2356,14 +2379,14 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { } Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, - FileMetaData* meta, + FileMetaData** meta, ColumnFamilyData** cfd) { for (auto cfd_iter : *column_family_set_) { Version* version = cfd_iter->current(); for (int level = 0; level < version->NumberLevels(); level++) { for (const auto& file : version->files_[level]) { if (file->number == number) { - *meta = *file; 
+ *meta = file; *filelevel = level; *cfd = cfd_iter; return Status::OK(); diff --git a/db/version_set.h b/db/version_set.h index 43705bf99..e2cbd5643 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -85,8 +85,8 @@ class Version { }; void Get(const ReadOptions&, const LookupKey& key, std::string* val, Status* status, MergeContext* merge_context, - GetStats* stats, const Options& db_option, bool* value_found = - nullptr); + GetStats* stats, const Options& db_option, + bool* value_found = nullptr); // Adds "stats" into the current state. Returns true if a new // compaction may need to be triggered, false otherwise. @@ -101,7 +101,9 @@ class Version { // Reference count management (so Versions do not disappear out from // under live iterators) void Ref(); - void Unref(); + // Decrease reference count. Delete the object if no reference left + // and return true. Otherwise, return false. + bool Unref(); // Returns true iff some level needs a compaction. bool NeedsCompaction() const; @@ -384,7 +386,7 @@ class VersionSet { bool VerifyCompactionFileConsistency(Compaction* c); Status GetMetadataForFile(uint64_t number, int* filelevel, - FileMetaData* metadata, ColumnFamilyData** cfd); + FileMetaData** metadata, ColumnFamilyData** cfd); void GetLiveFilesMetaData( std::vector *metadata); diff --git a/db/write_batch.cc b/db/write_batch.cc index 1132b3551..084091aad 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -146,7 +146,7 @@ Status WriteBatch::Iterate(Handler* handler) const { return Status::Corruption("unknown WriteBatch tag"); } } - if (found != WriteBatchInternal::Count(this)) { + if (found != WriteBatchInternal::Count(this)) { return Status::Corruption("WriteBatch has wrong count"); } else { return Status::OK(); @@ -261,14 +261,45 @@ class MemTableInserter : public WriteBatch::Handler { } MemTable* mem = cf_mems_->GetMemTable(); const Options* options = cf_mems_->GetFullOptions(); - if (options->inplace_update_support && - mem->Update(sequence_, 
kTypeValue, key, value)) { + if (!options->inplace_update_support) { + mem->Add(sequence_, kTypeValue, key, value); + } else if (options->inplace_callback == nullptr) { + mem->Update(sequence_, key, value); RecordTick(options->statistics.get(), NUMBER_KEYS_UPDATED); } else { - mem->Add(sequence_, kTypeValue, key, value); + if (mem->UpdateCallback(sequence_, key, value, *options)) { + } else { + // key not found in memtable. Do sst get, update, add + SnapshotImpl read_from_snapshot; + read_from_snapshot.number_ = sequence_; + ReadOptions ropts; + ropts.snapshot = &read_from_snapshot; + + std::string prev_value; + std::string merged_value; + Status s = db_->Get(ropts, key, &prev_value); + char* prev_buffer = const_cast(prev_value.c_str()); + uint32_t prev_size = prev_value.size(); + auto status = options->inplace_callback(s.ok() ? prev_buffer : nullptr, + s.ok() ? &prev_size : nullptr, + value, &merged_value); + if (status == UpdateStatus::UPDATED_INPLACE) { + // prev_value is updated in-place with final value. + mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size)); + RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN); + } else if (status == UpdateStatus::UPDATED) { + // merged_value contains the final value. + mem->Add(sequence_, kTypeValue, key, Slice(merged_value)); + RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN); + } + } } + // Since all Puts are logged in trasaction logs (if enabled), always bump + // sequence number. Even if the update eventually fails and does not result + // in memtable add/update. 
sequence_++; } + virtual void MergeCF(uint32_t column_family_id, const Slice& key, const Slice& value) { bool found = cf_mems_->Seek(column_family_id); @@ -333,6 +364,7 @@ class MemTableInserter : public WriteBatch::Handler { sequence_++; } + virtual void DeleteCF(uint32_t column_family_id, const Slice& key) { bool found = cf_mems_->Seek(column_family_id); if (!found || IgnoreUpdate()) { diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index a2dee2959..d56d7107a 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -58,10 +58,7 @@ static std::string PrintContents(WriteBatch* b) { state.append(")"); count++; break; - case kTypeColumnFamilyDeletion: - case kTypeColumnFamilyValue: - case kTypeColumnFamilyMerge: - case kTypeLogData: + default: assert(false); break; } diff --git a/include/rocksdb/arena.h b/include/rocksdb/arena.h deleted file mode 100644 index 642b61408..000000000 --- a/include/rocksdb/arena.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Arena class defines memory allocation methods. It's used by memtable and -// skiplist. - -#ifndef STORAGE_ROCKSDB_INCLUDE_ARENA_H_ -#define STORAGE_ROCKSDB_INCLUDE_ARENA_H_ - -#include -#include - -namespace rocksdb { - -class Arena { - public: - Arena() {}; - virtual ~Arena() {}; - - // Return a pointer to a newly allocated memory block of "bytes" bytes. - virtual char* Allocate(size_t bytes) = 0; - - // Allocate memory with the normal alignment guarantees provided by malloc. 
- virtual char* AllocateAligned(size_t bytes) = 0; - - // Returns an estimate of the total memory used by arena. - virtual const size_t ApproximateMemoryUsage() = 0; - - // Returns the total number of bytes in all blocks allocated so far. - virtual const size_t MemoryAllocatedBytes() = 0; - - private: - // No copying allowed - Arena(const Arena&); - void operator=(const Arena&); -}; - -} // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_ARENA_H_ diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 7d58e1546..b5821bac2 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -102,7 +102,10 @@ class Cache { virtual uint64_t NewId() = 0; // returns the maximum configured capacity of the cache - virtual size_t GetCapacity() = 0; + virtual size_t GetCapacity() const = 0; + + // returns the memory size for the entries residing in the cache. + virtual size_t GetUsage() const = 0; // Call this on shutdown if you want to speed it up. Cache will disown // any underlying data and will not free it on delete. This call will leak diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 484582c90..48a4d33d4 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -438,7 +438,7 @@ class WritableFile { // This asks the OS to initiate flushing the cached data to disk, // without waiting for completion. // Default implementation does nothing. - virtual Status RangeSync(off64_t offset, off64_t nbytes) { + virtual Status RangeSync(off_t offset, off_t nbytes) { return Status::OK(); } diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index 9e24942ac..e9a41aedd 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -33,8 +33,7 @@ // iteration over the entire collection is rare since doing so requires all the // keys to be copied into a sorted data structure. 
-#ifndef STORAGE_ROCKSDB_DB_MEMTABLEREP_H_ -#define STORAGE_ROCKSDB_DB_MEMTABLEREP_H_ +#pragma once #include @@ -52,7 +51,11 @@ class MemTableRep { public: // Compare a and b. Return a negative value if a is less than b, 0 if they // are equal, and a positive value if a is greater than b - virtual int operator()(const char* a, const char* b) const = 0; + virtual int operator()(const char* prefix_len_key1, + const char* prefix_len_key2) const = 0; + + virtual int operator()(const char* prefix_len_key, + const Slice& key) const = 0; virtual ~KeyComparator() { } }; @@ -100,7 +103,7 @@ class MemTableRep { virtual void Prev() = 0; // Advance to the first entry with a key >= target - virtual void Seek(const char* target) = 0; + virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0; // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. @@ -175,26 +178,22 @@ public: } }; -// HashSkipListRep is backed by hash map of buckets. Each bucket is a skip -// list. All the keys with the same prefix will be in the same bucket. -// The prefix is determined using user supplied SliceTransform. It has -// to match prefix_extractor in options.prefix_extractor. -// -// Iteration over the entire collection is implemented by dumping all the keys -// into a separate skip list. Thus, these data structures are best used when -// iteration over the entire collection is rare. -// -// Parameters: -// transform: The prefix extractor that returns prefix when supplied a user -// key. Has to match options.prefix_extractor -// bucket_count: Number of buckets in a hash_map. Each bucket needs -// 8 bytes. By default, we set buckets to one million, which -// will take 8MB of memory. If you know the number of keys you'll -// keep in hash map, set bucket count to be approximately twice -// the number of keys +// This class contains a fixed array of buckets, each +// pointing to a skiplist (null if the bucket is empty). 
+// bucket_count: number of fixed array buckets +// skiplist_height: the max height of the skiplist +// skiplist_branching_factor: probabilistic size ratio between adjacent +// link lists in the skiplist extern MemTableRepFactory* NewHashSkipListRepFactory( - const SliceTransform* transform, size_t bucket_count = 1000000); + const SliceTransform* transform, size_t bucket_count = 1000000, + int32_t skiplist_height = 4, int32_t skiplist_branching_factor = 4 +); -} +// The factory is to create memtables with a hashed linked list: +// it contains a fixed array of buckets, each pointing to a sorted single +// linked list (null if the bucket is empty). +// bucket_count: number of fixed array buckets +extern MemTableRepFactory* NewHashLinkListRepFactory( + const SliceTransform* transform, size_t bucket_count = 50000); -#endif // STORAGE_ROCKSDB_DB_MEMTABLEREP_H_ +} diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 47ee930e8..4623543fd 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -34,6 +34,7 @@ class TablePropertiesCollector; class Slice; class SliceTransform; class Statistics; +class InternalKeyComparator; using std::shared_ptr; @@ -65,6 +66,12 @@ struct CompressionOptions { : window_bits(wbits), level(lev), strategy(strategy) {} }; +enum UpdateStatus { // Return status For inplace update callback + UPDATE_FAILED = 0, // Nothing to update + UPDATED_INPLACE = 1, // Value updated inplace + UPDATED = 2, // No inplace update. Merged value set +}; + struct Options; struct ColumnFamilyOptions { @@ -410,13 +417,17 @@ struct ColumnFamilyOptions { // the tables. // Default: emtpy vector -- no user-defined statistics collection will be // performed. - std::vector> - table_properties_collectors; - - // Allows thread-safe inplace updates. Requires Updates iff - // * key exists in current memtable - // * new sizeof(new_value) <= sizeof(old_value) - // * old_value for that key is a put i.e. 
kTypeValue + typedef std::vector> + TablePropertiesCollectors; + TablePropertiesCollectors table_properties_collectors; + + // Allows thread-safe inplace updates. + // If inplace_callback function is not set, + // Put(key, new_value) will update inplace the existing_value iff + // * key exists in current memtable + // * new sizeof(new_value) <= sizeof(existing_value) + // * existing_value for that key is a put i.e. kTypeValue + // If inplace_callback function is set, check doc for inplace_callback. // Default: false. bool inplace_update_support; @@ -424,6 +435,55 @@ struct ColumnFamilyOptions { // Default: 10000, if inplace_update_support = true, else 0. size_t inplace_update_num_locks; + // existing_value - pointer to previous value (from both memtable and sst). + // nullptr if key doesn't exist + // existing_value_size - pointer to size of existing_value). + // nullptr if key doesn't exist + // delta_value - Delta value to be merged with the existing_value. + // Stored in transaction logs. + // merged_value - Set when delta is applied on the previous value. + + // Applicable only when inplace_update_support is true, + // this callback function is called at the time of updating the memtable + // as part of a Put operation, lets say Put(key, delta_value). It allows the + // 'delta_value' specified as part of the Put operation to be merged with + // an 'existing_value' of the key in the database. + + // If the merged value is smaller in size that the 'existing_value', + // then this function can update the 'existing_value' buffer inplace and + // the corresponding 'existing_value'_size pointer, if it wishes to. + // The callback should return UpdateStatus::UPDATED_INPLACE. + // In this case. (In this case, the snapshot-semantics of the rocksdb + // Iterator is not atomic anymore). 
+ + // If the merged value is larger in size than the 'existing_value' or the + // application does not wish to modify the 'existing_value' buffer inplace, + // then the merged value should be returned via *merge_value. It is set by + // merging the 'existing_value' and the Put 'delta_value'. The callback should + // return UpdateStatus::UPDATED in this case. This merged value will be added + // to the memtable. + + // If merging fails or the application does not wish to take any action, + // then the callback should return UpdateStatus::UPDATE_FAILED. + + // Please remember that the original call from the application is Put(key, + // delta_value). So the transaction log (if enabled) will still contain (key, + // delta_value). The 'merged_value' is not stored in the transaction log. + // Hence the inplace_callback function should be consistent across db reopens. + + // Default: nullptr + UpdateStatus (*inplace_callback)(char* existing_value, + uint32_t* existing_value_size, + Slice delta_value, + std::string* merged_value); + + // if prefix_extractor is set and bloom_bits is not 0, create prefix bloom + // for memtable + uint32_t memtable_prefix_bloom_bits; + + // number of hash probes per key + uint32_t memtable_prefix_bloom_probes; + // Maximum number of successive merge operations on a key in the memtable. // // When a merge operation is added to the memtable and the maximum number of @@ -473,9 +533,10 @@ struct DBOptions { shared_ptr info_log; // Number of open files that can be used by the DB. You may need to - // increase this if your database has a large working set (budget - // one open file per 2MB of working set). - // + // increase this if your database has a large working set. Value -1 means + // files opened are always kept open. You can estimate number of files based + // on target_file_size_base and target_file_size_multiplier for level-based + // compaction. For universal-style compaction, you can usually set it to -1. 
// Default: 1000 int max_open_files; diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index 9e900e050..551ca8fe6 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -38,7 +38,27 @@ struct PerfContext { uint64_t internal_key_skipped_count; // total number of deletes skipped over during iteration uint64_t internal_delete_skipped_count; - uint64_t wal_write_time; // total time spent on writing to WAL + + uint64_t get_snapshot_time; // total time spent on getting snapshot + uint64_t get_from_memtable_time; // total time spent on querying memtables + uint64_t get_from_memtable_count; // number of mem tables queried + // total time spent after Get() finds a key + uint64_t get_post_process_time; + uint64_t get_from_output_files_time; // total time reading from output files + // total time spent on seeking child iters + uint64_t seek_child_seek_time; + // number of seek issued in child iterators + uint64_t seek_child_seek_count; + uint64_t seek_min_heap_time; // total time spent on the merge heap + // total time spent on seeking the internal entries + uint64_t seek_internal_seek_time; + // total time spent on iterating internal entries to find the next user entry + uint64_t find_next_user_entry_time; + // total time spent on pre or post processing when writing a record + uint64_t write_pre_and_post_process_time; + uint64_t write_wal_time; // total time spent on writing to WAL + // total time spent on writing to mem tables + uint64_t write_memtable_time; }; extern __thread PerfContext perf_context; diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index f5fbb5924..cddd74bf8 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -7,7 +7,6 @@ #define STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_ #include -#include #include #include #include @@ -18,10 +17,8 @@ namespace rocksdb { /** * Keep adding ticker's here. - * Any ticker should have a value less than TICKER_ENUM_MAX. 
- * Add a new ticker by assigning it the current value of TICKER_ENUM_MAX - * Add a string representation in TickersNameMap below. - * And incrementing TICKER_ENUM_MAX. + * 1. Any ticker should be added before TICKER_ENUM_MAX. + * 2. Add a readable string in TickersNameMap below for the newly added ticker. */ enum Tickers { // total block cache misses @@ -252,7 +249,7 @@ class Statistics { virtual void setTickerCount(Tickers tickerType, uint64_t count) = 0; virtual void measureTime(Histograms histogramType, uint64_t time) = 0; - virtual void histogramData(Histograms type, HistogramData * const data) = 0; + virtual void histogramData(Histograms type, HistogramData* const data) = 0; // String representation of the statistic object. std::string ToString(); }; diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 2d2bfacc4..d4965ca45 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -1,127 +1,81 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Currently we support two types of tables: plain table and block-based table. +// 1. Block-based table: this is the default table type that we inherited from +// LevelDB, which was designed for storing data in hard disk or flash +// device. +// 2. Plain table: it is one of RocksDB's SST file format optimized +// for low query latency on pure-memory or really low-latency media. 
+// +// A tutorial of rocksdb table formats is available here: +// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats +// +// Example code is also available +// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples #pragma once #include -#include +#include +#include + #include "rocksdb/env.h" #include "rocksdb/iterator.h" -#include "rocksdb/table_properties.h" #include "rocksdb/options.h" +#include "rocksdb/status.h" namespace rocksdb { -struct Options; +// -- Block-based Table +class FlushBlockPolicyFactory; class RandomAccessFile; -struct ReadOptions; -class TableCache; +class TableBuilder; +class TableReader; class WritableFile; +struct EnvOptions; +struct Options; using std::unique_ptr; -// TableBuilder provides the interface used to build a Table -// (an immutable and sorted map from keys to values). -// -// Multiple threads can invoke const methods on a TableBuilder without -// external synchronization, but if any of the threads may call a -// non-const method, all threads accessing the same TableBuilder must use -// external synchronization. -class TableBuilder { - public: - // REQUIRES: Either Finish() or Abandon() has been called. - virtual ~TableBuilder() {} - - // Add key,value to the table being constructed. - // REQUIRES: key is after any previously added key according to comparator. - // REQUIRES: Finish(), Abandon() have not been called - virtual void Add(const Slice& key, const Slice& value) = 0; - - // Return non-ok iff some error has been detected. - virtual Status status() const = 0; - - // Finish building the table. - // REQUIRES: Finish(), Abandon() have not been called - virtual Status Finish() = 0; - - // Indicate that the contents of this builder should be abandoned. - // If the caller is not going to call Finish(), it must call Abandon() - // before destroying this builder. 
- // REQUIRES: Finish(), Abandon() have not been called - virtual void Abandon() = 0; - - // Number of calls to Add() so far. - virtual uint64_t NumEntries() const = 0; - - // Size of the file generated so far. If invoked after a successful - // Finish() call, returns the size of the final generated file. - virtual uint64_t FileSize() const = 0; -}; - -// A Table is a sorted map from strings to strings. Tables are -// immutable and persistent. A Table may be safely accessed from -// multiple threads without external synchronization. -class TableReader { - public: - virtual ~TableReader() {} - - // Determine whether there is a chance that the current table file - // contains the key a key starting with iternal_prefix. The specific - // table implementation can use bloom filter and/or other heuristic - // to filter out this table as a whole. - virtual bool PrefixMayMatch(const Slice& internal_prefix) = 0; - - // Returns a new iterator over the table contents. - // The result of NewIterator() is initially invalid (caller must - // call one of the Seek methods on the iterator before using it). - virtual Iterator* NewIterator(const ReadOptions&) = 0; - - // Given a key, return an approximate byte offset in the file where - // the data for that key begins (or would begin if the key were - // present in the file). The returned value is in terms of file - // bytes, and so includes effects like compression of the underlying data. - // E.g., the approximate offset of the last key in the table will - // be close to the file length. - virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0; - - // Returns true if the block for the specified key is in cache. - // REQUIRES: key is in this table. - virtual bool TEST_KeyInCache(const ReadOptions& options, - const Slice& key) = 0; - - // Set up the table for Compaction. 
Might change some parameters with - // posix_fadvise - virtual void SetupForCompaction() = 0; - - virtual TableProperties& GetTableProperties() = 0; - - // Calls (*result_handler)(handle_context, ...) repeatedly, starting with - // the entry found after a call to Seek(key), until result_handler returns - // false, where k is the actual internal key for a row found and v as the - // value of the key. didIO is true if I/O is involved in the operation. May - // not make such a call if filter policy says that key is not present. - // - // mark_key_may_exist_handler needs to be called when it is configured to be - // memory only and the key is not found in the block cache, with - // the parameter to be handle_context. +// For advanced user only +struct BlockBasedTableOptions { + // @flush_block_policy_factory creates the instances of flush block policy. + // which provides a configurable way to determine when to flush a block in + // the block based tables. If not set, table builder will use the default + // block flush policy, which cut blocks by block size (please refer to + // `FlushBlockBySizePolicy`). + std::shared_ptr flush_block_policy_factory; + + // TODO(kailiu) Temporarily disable this feature by making the default value + // to be false. // - // readOptions is the options for the read - // key is the key to search for - virtual Status Get( - const ReadOptions& readOptions, - const Slice& key, - void* handle_context, - bool (*result_handler)(void* handle_context, const Slice& k, - const Slice& v, bool didIO), - void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0; + // Indicating if we'd put index/filter blocks to the block cache. + // If not specified, each "table reader" object will pre-load index/filter + // block during table initialization. + bool cache_index_and_filter_blocks = false; }; -// A base class for table factories +// Create default block based table factory. 
+extern TableFactory* NewBlockBasedTableFactory( + const BlockBasedTableOptions& table_options = BlockBasedTableOptions()); + +// -- Plain Table +// @user_key_len: plain table has optimization for fix-sized keys, which can be +// specified via user_key_len. Alternatively, you can pass +// `kPlainTableVariableLength` if your keys have variable +// lengths. +// @bloom_bits_per_key: the number of bits used for bloom filer per key. You may +// disable it by passing a zero. +// @hash_table_ratio: the desired utilization of the hash table used for prefix +// hashing. hash_table_ratio = number of prefixes / #buckets +// in the hash table +const uint32_t kPlainTableVariableLength = 0; +extern TableFactory* NewPlainTableFactory( + uint32_t user_key_len = kPlainTableVariableLength, + int bloom_bits_per_key = 10, double hash_table_ratio = 0.75); + +// A base class for table factories. class TableFactory { public: virtual ~TableFactory() {} @@ -139,7 +93,7 @@ class TableFactory { // in parameter file. It's the caller's responsibility to make sure // file is in the correct format. // - // GetTableReader() is called in two places: + // NewTableReader() is called in two places: // (1) TableCache::FindTable() calls the function when table cache miss // and cache the table object returned. // (1) SstFileReader (for SST Dump) opens the table and dump the table @@ -150,9 +104,10 @@ class TableFactory { // file is a file handler to handle the file for the table // file_size is the physical file size of the file // table_reader is the output table reader - virtual Status GetTableReader( + virtual Status NewTableReader( const Options& options, const EnvOptions& soptions, - unique_ptr && file, uint64_t file_size, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const = 0; // Return a table builder to write to a file for this table type. @@ -173,8 +128,9 @@ class TableFactory { // file is a handle of a writable file. 
It is the caller's responsibility to // keep the file open and close the file after closing the table builder. // compression_type is the compression type to use in this table. - virtual TableBuilder* GetTableBuilder( - const Options& options, WritableFile* file, - CompressionType compression_type) const = 0; + virtual TableBuilder* NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const = 0; }; + } // namespace rocksdb diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index 8824ca13c..1d4b9e344 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -1,28 +1,25 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once #include #include - #include "rocksdb/status.h" namespace rocksdb { +// -- Table Properties +// Other than basic table properties, each table may also have the user +// collected properties. +// The value of the user-collected properties are encoded as raw bytes -- +// users have to interprete these values by themselves. +typedef std::unordered_map UserCollectedProperties; + // TableProperties contains a bunch of read-only properties of its associated // table. struct TableProperties { public: - // Other than basic table properties, each table may also have the user - // collected properties. - // The value of the user-collected properties are encoded as raw bytes -- - // users have to interprete these values by themselves. 
- typedef - std::unordered_map - UserCollectedProperties; - // the total size of all data blocks. uint64_t data_size = 0; // the size of index block. @@ -37,6 +34,10 @@ struct TableProperties { uint64_t num_data_blocks = 0; // the number of entries in this table uint64_t num_entries = 0; + // format version, reserved for backward compatibility + uint64_t format_version = 0; + // If 0, key is variable length. Otherwise number of bytes for each key. + uint64_t fixed_key_len = 0; // The name of the filter policy used in this table. // If no filter policy is used, `filter_policy_name` will be an empty string. @@ -47,17 +48,32 @@ struct TableProperties { // convert this object to a human readable form // @prop_delim: delimiter for each property. - std::string ToString( - const std::string& prop_delim = "; ", - const std::string& kv_delim = "=") const; + std::string ToString(const std::string& prop_delim = "; ", + const std::string& kv_delim = "=") const; }; +// table properties' human-readable names in the property block. +struct TablePropertiesNames { + static const std::string kDataSize; + static const std::string kIndexSize; + static const std::string kFilterSize; + static const std::string kRawKeySize; + static const std::string kRawValueSize; + static const std::string kNumDataBlocks; + static const std::string kNumEntries; + static const std::string kFormatVersion; + static const std::string kFixedKeyLen; + static const std::string kFilterPolicy; +}; + +extern const std::string kPropertiesBlock; + // `TablePropertiesCollector` provides the mechanism for users to collect // their own interested properties. This class is essentially a collection // of callback functions that will be invoked during table building. class TablePropertiesCollector { public: - virtual ~TablePropertiesCollector() { } + virtual ~TablePropertiesCollector() {} // Add() will be called when a new key/value pair is inserted into the table. 
// @params key the original key that is inserted into the table. @@ -68,23 +84,20 @@ class TablePropertiesCollector { // for writing the properties block. // @params properties User will add their collected statistics to // `properties`. - virtual Status Finish( - TableProperties::UserCollectedProperties* properties) = 0; + virtual Status Finish(UserCollectedProperties* properties) = 0; // The name of the properties collector can be used for debugging purpose. virtual const char* Name() const = 0; // Return the human-readable properties, where the key is property name and // the value is the human-readable form of value. - virtual TableProperties::UserCollectedProperties - GetReadableProperties() const = 0; + virtual UserCollectedProperties GetReadableProperties() const = 0; }; // Extra properties // Below is a list of non-basic properties that are collected by database // itself. Especially some properties regarding to the internal keys (which // is unknown to `table`). -extern uint64_t GetDeletedKeys( - const TableProperties::UserCollectedProperties& props); +extern uint64_t GetDeletedKeys(const UserCollectedProperties& props); } // namespace rocksdb diff --git a/port/port_posix.h b/port/port_posix.h index 15ab0dc5b..8ff2480a3 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -396,7 +396,6 @@ inline char* BZip2_Uncompress(const char* input_data, size_t input_length, _stream.next_out = (char *)(output + old_sz); _stream.avail_out = output_len - old_sz; break; - case Z_BUF_ERROR: default: delete[] output; BZ2_bzDecompressEnd(&_stream); diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index a5e546be8..e5f3bd4d2 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -17,15 +17,17 @@ #include "rocksdb/flush_block_policy.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" -#include "rocksdb/table.h" +#include "table/table_builder.h" #include "rocksdb/env.h" #include 
"rocksdb/filter_policy.h" #include "rocksdb/options.h" +#include "db/dbformat.h" #include "table/block_based_table_reader.h" #include "table/block.h" #include "table/block_builder.h" #include "table/filter_block.h" #include "table/format.h" +#include "table/meta_blocks.h" #include "util/coding.h" #include "util/crc32c.h" #include "util/stop_watch.h" @@ -34,51 +36,24 @@ namespace rocksdb { namespace { -struct BytewiseLessThan { - bool operator()(const std::string& key1, const std::string& key2) const { - // smaller entries will be placed in front. - return comparator->Compare(key1, key2) <= 0; - } - const Comparator* comparator = BytewiseComparator(); -}; - -// When writing to a block that requires entries to be sorted by -// `BytewiseComparator`, we can buffer the content to `BytewiseSortedMap` -// before writng to store. -typedef std::map BytewiseSortedMap; - -void AddProperties(BytewiseSortedMap& props, std::string name, uint64_t val) { - assert(props.find(name) == props.end()); - - std::string dst; - PutVarint64(&dst, val); - - props.insert( - std::make_pair(name, dst) - ); -} - static bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) { // Check to see if compressed less than 12.5% return compressed_size < raw_size - (raw_size / 8u); } -// Were we encounter any error occurs during user-defined statistics collection, -// we'll write the warning message to info log. -void LogPropertiesCollectionError( - Logger* info_log, const std::string& method, const std::string& name) { - assert(method == "Add" || method == "Finish"); - - std::string msg = - "[Warning] encountered error when calling TablePropertiesCollector::" + - method + "() with collector name: " + name; - Log(info_log, "%s", msg.c_str()); -} - } // anonymous namespace +// kBlockBasedTableMagicNumber was picked by running +// echo http://code.google.com/p/leveldb/ | sha1sum +// and taking the leading 64 bits. 
+// Please note that kBlockBasedTableMagicNumber may also be accessed by +// other .cc files so it have to be explicitly declared with "extern". +extern const uint64_t kBlockBasedTableMagicNumber + = 0xdb4775248b80fb57ull; + struct BlockBasedTableBuilder::Rep { Options options; + const InternalKeyComparator& internal_comparator; WritableFile* file; uint64_t offset = 0; Status status; @@ -98,31 +73,30 @@ struct BlockBasedTableBuilder::Rep { std::string compressed_output; std::unique_ptr flush_block_policy; - Rep(const Options& opt, - WritableFile* f, - FlushBlockPolicyFactory* flush_block_policy_factory, + Rep(const Options& opt, const InternalKeyComparator& icomparator, + WritableFile* f, FlushBlockPolicyFactory* flush_block_policy_factory, CompressionType compression_type) : options(opt), + internal_comparator(icomparator), file(f), - data_block(options), + data_block(options, &internal_comparator), // To avoid linear scan, we make the block_restart_interval to be `1` // in index block builder - index_block(1 /* block_restart_interval */, options.comparator), + index_block(1 /* block_restart_interval */, &internal_comparator), compression_type(compression_type), - filter_block(opt.filter_policy == nullptr ? nullptr - : new FilterBlockBuilder(opt)), + filter_block(opt.filter_policy == nullptr + ? 
nullptr + : new FilterBlockBuilder(opt, &internal_comparator)), flush_block_policy( - flush_block_policy_factory->NewFlushBlockPolicy(data_block)) { - } + flush_block_policy_factory->NewFlushBlockPolicy(data_block)) {} }; BlockBasedTableBuilder::BlockBasedTableBuilder( - const Options& options, - WritableFile* file, - FlushBlockPolicyFactory* flush_block_policy_factory, + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, FlushBlockPolicyFactory* flush_block_policy_factory, CompressionType compression_type) - : rep_(new Rep(options, - file, flush_block_policy_factory, compression_type)) { + : rep_(new Rep(options, internal_comparator, file, + flush_block_policy_factory, compression_type)) { if (rep_->filter_block != nullptr) { rep_->filter_block->StartBlock(0); } @@ -145,7 +119,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { assert(!r->closed); if (!ok()) return; if (r->props.num_entries > 0) { - assert(r->options.comparator->Compare(key, Slice(r->last_key)) > 0); + assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0); } auto should_flush = r->flush_block_policy->Update(key, value); @@ -162,7 +136,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { // entries in the first block and < all entries in subsequent // blocks. 
if (ok()) { - r->options.comparator->FindShortestSeparator(&r->last_key, key); + r->internal_comparator.FindShortestSeparator(&r->last_key, key); std::string handle_encoding; r->pending_handle.EncodeTo(&handle_encoding); r->index_block.Add(r->last_key, Slice(handle_encoding)); @@ -179,16 +153,12 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { r->props.raw_key_size += key.size(); r->props.raw_value_size += value.size(); - for (auto collector : r->options.table_properties_collectors) { - Status s = collector->Add(key, value); - if (!s.ok()) { - LogPropertiesCollectionError( - r->options.info_log.get(), - "Add", /* method */ - collector->Name() - ); - } - } + NotifyCollectTableCollectorsOnAdd( + key, + value, + r->options.table_properties_collectors, + r->options.info_log.get() + ); } void BlockBasedTableBuilder::Flush() { @@ -370,7 +340,7 @@ Status BlockBasedTableBuilder::Finish() { // block, we will finish writing all index entries here and flush them // to storage after metaindex block is written. if (ok() && !empty_data_block) { - r->options.comparator->FindShortSuccessor(&r->last_key); + r->internal_comparator.FindShortSuccessor(&r->last_key); std::string handle_encoding; r->pending_handle.EncodeTo(&handle_encoding); @@ -382,14 +352,7 @@ Status BlockBasedTableBuilder::Finish() { // 2. [meta block: properties] // 3. [metaindex block] if (ok()) { - // We use `BytewiseComparator` as the comparator for meta block. - BlockBuilder meta_index_block( - r->options.block_restart_interval, - BytewiseComparator() - ); - // Key: meta block name - // Value: block handle to that meta block - BytewiseSortedMap meta_block_handles; + MetaIndexBuilder meta_index_builer; // Write filter block. if (r->filter_block != nullptr) { @@ -397,104 +360,43 @@ Status BlockBasedTableBuilder::Finish() { // of filter data. 
std::string key = BlockBasedTable::kFilterBlockPrefix; key.append(r->options.filter_policy->Name()); - std::string handle_encoding; - filter_block_handle.EncodeTo(&handle_encoding); - meta_block_handles.insert( - std::make_pair(key, handle_encoding) - ); + meta_index_builer.Add(key, filter_block_handle); } // Write properties block. { - BlockBuilder properties_block( - r->options.block_restart_interval, - BytewiseComparator() - ); - - BytewiseSortedMap properties; - - // Add basic properties - AddProperties( - properties, - BlockBasedTablePropertiesNames::kRawKeySize, - r->props.raw_key_size - ); - AddProperties( - properties, - BlockBasedTablePropertiesNames::kRawValueSize, - r->props.raw_value_size - ); - AddProperties( - properties, - BlockBasedTablePropertiesNames::kDataSize, - r->props.data_size - ); + PropertyBlockBuilder property_block_builder; + std::vector failed_user_prop_collectors; + r->props.filter_policy_name = r->options.filter_policy != nullptr ? + r->options.filter_policy->Name() : ""; r->props.index_size = r->index_block.CurrentSizeEstimate() + kBlockTrailerSize; - AddProperties( - properties, - BlockBasedTablePropertiesNames::kIndexSize, - r->props.index_size - ); - AddProperties( - properties, - BlockBasedTablePropertiesNames::kNumEntries, - r->props.num_entries - ); - AddProperties( - properties, - BlockBasedTablePropertiesNames::kNumDataBlocks, - r->props.num_data_blocks); - if (r->filter_block != nullptr) { - properties.insert({ - BlockBasedTablePropertiesNames::kFilterPolicy, - r->options.filter_policy->Name() - }); - } - AddProperties( - properties, - BlockBasedTablePropertiesNames::kFilterSize, - r->props.filter_size - ); - for (auto collector : r->options.table_properties_collectors) { - TableProperties::UserCollectedProperties user_collected_properties; - Status s = - collector->Finish(&user_collected_properties); - - if (!s.ok()) { - LogPropertiesCollectionError( - r->options.info_log.get(), - "Finish", /* method */ - collector->Name() 
- ); - } else { - properties.insert( - user_collected_properties.begin(), - user_collected_properties.end() - ); - } - } + // Add basic properties + property_block_builder.AddTableProperty(r->props); - for (const auto& stat : properties) { - properties_block.Add(stat.first, stat.second); - } + NotifyCollectTableCollectorsOnFinish( + r->options.table_properties_collectors, + r->options.info_log.get(), + &property_block_builder + ); BlockHandle properties_block_handle; - WriteBlock(&properties_block, &properties_block_handle); - - std::string handle_encoding; - properties_block_handle.EncodeTo(&handle_encoding); - meta_block_handles.insert( - { BlockBasedTable::kPropertiesBlock, handle_encoding } + WriteRawBlock( + property_block_builder.Finish(), + kNoCompression, + &properties_block_handle ); - } // end of properties block writing - for (const auto& metablock : meta_block_handles) { - meta_index_block.Add(metablock.first, metablock.second); - } + meta_index_builer.Add(kPropertiesBlock, + properties_block_handle); + } // end of properties block writing - WriteBlock(&meta_index_block, &metaindex_block_handle); + WriteRawBlock( + meta_index_builer.Finish(), + kNoCompression, + &metaindex_block_handle + ); } // meta blocks and metaindex block. 
// Write index block @@ -504,7 +406,7 @@ Status BlockBasedTableBuilder::Finish() { // Write footer if (ok()) { - Footer footer; + Footer footer(kBlockBasedTableMagicNumber); footer.set_metaindex_handle(metaindex_block_handle); footer.set_index_handle(index_block_handle); std::string footer_encoding; @@ -556,4 +458,7 @@ uint64_t BlockBasedTableBuilder::FileSize() const { return rep_->offset; } +const std::string BlockBasedTable::kFilterBlockPrefix = + "filter."; + } // namespace rocksdb diff --git a/table/block_based_table_builder.h b/table/block_based_table_builder.h index 517f8e785..1c4be1f83 100644 --- a/table/block_based_table_builder.h +++ b/table/block_based_table_builder.h @@ -12,7 +12,7 @@ #include "rocksdb/flush_block_policy.h" #include "rocksdb/options.h" #include "rocksdb/status.h" -#include "rocksdb/table.h" +#include "table/table_builder.h" namespace rocksdb { @@ -20,13 +20,13 @@ class BlockBuilder; class BlockHandle; class WritableFile; - class BlockBasedTableBuilder : public TableBuilder { public: // Create a builder that will store the contents of the table it is // building in *file. Does not close the file. It is up to the // caller to close the file after calling Finish(). 
BlockBasedTableBuilder(const Options& options, + const InternalKeyComparator& internal_comparator, WritableFile* file, FlushBlockPolicyFactory* flush_block_policy_factory, CompressionType compression_type); diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index a9cd35a68..6a4a64462 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -18,17 +18,19 @@ namespace rocksdb { -Status BlockBasedTableFactory::GetTableReader( +Status BlockBasedTableFactory::NewTableReader( const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const { return BlockBasedTable::Open(options, soptions, table_options_, - std::move(file), file_size, table_reader); + internal_comparator, std::move(file), file_size, + table_reader); } -TableBuilder* BlockBasedTableFactory::GetTableBuilder( - const Options& options, WritableFile* file, - CompressionType compression_type) const { +TableBuilder* BlockBasedTableFactory::NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const { auto flush_block_policy_factory = table_options_.flush_block_policy_factory.get(); @@ -45,11 +47,9 @@ TableBuilder* BlockBasedTableFactory::GetTableBuilder( options.block_size_deviation); } - auto table_builder = new BlockBasedTableBuilder( - options, - file, - flush_block_policy_factory, - compression_type); + auto table_builder = + new BlockBasedTableBuilder(options, internal_comparator, file, + flush_block_policy_factory, compression_type); // Delete flush_block_policy_factory only when it's just created from the // options. 
@@ -63,4 +63,9 @@ TableBuilder* BlockBasedTableFactory::GetTableBuilder( return table_builder; } +TableFactory* NewBlockBasedTableFactory( + const BlockBasedTableOptions& table_options) { + return new BlockBasedTableFactory(table_options); +} + } // namespace rocksdb diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index 5a4d1bd6e..556997065 100644 --- a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -14,7 +14,6 @@ #include "rocksdb/flush_block_policy.h" #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "table/block_based_table_options.h" namespace rocksdb { @@ -22,31 +21,26 @@ struct Options; struct EnvOptions; using std::unique_ptr; -class Status; -class RandomAccessFile; -class WritableFile; -class Table; -class TableBuilder; -class BlockBasedTable; class BlockBasedTableBuilder; -class BlockBasedTableFactory: public TableFactory { +class BlockBasedTableFactory : public TableFactory { public: - BlockBasedTableFactory() : BlockBasedTableFactory(BlockBasedTableOptions()) {} - explicit BlockBasedTableFactory(const BlockBasedTableOptions& table_options) + explicit BlockBasedTableFactory( + const BlockBasedTableOptions& table_options = BlockBasedTableOptions()) : table_options_(table_options) {} ~BlockBasedTableFactory() {} const char* Name() const override { return "BlockBasedTable"; } - Status GetTableReader(const Options& options, const EnvOptions& soptions, + Status NewTableReader(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const override; - TableBuilder* GetTableBuilder(const Options& options, WritableFile* file, - CompressionType compression_type) - const override; + TableBuilder* NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const override; private: 
BlockBasedTableOptions table_options_; diff --git a/table/block_based_table_options.h b/table/block_based_table_options.h deleted file mode 100644 index f5774e2bf..000000000 --- a/table/block_based_table_options.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#pragma once -#include - -namespace rocksdb { - -class FlushBlockPolicyFactory; - -struct BlockBasedTableOptions { - // @flush_block_policy_factory creates the instances of flush block policy. - // which provides a configurable way to determine when to flush a block in - // the block based tables. If not set, table builder will use the default - // block flush policy, which cut blocks by block size (please refer to - // `FlushBlockBySizePolicy`). - std::shared_ptr flush_block_policy_factory; - - // TODO(kailiu) Temporarily disable this feature by making the default value - // to be false. Also in master branch, this file is non-public so no user - // will be able to change the value of `cache_index_and_filter_blocks`. - // - // Indicating if we'd put index/filter blocks to the block cache. - // If not specified, each "table reader" object will pre-load index/filter - // block during table initialization. 
- bool cache_index_and_filter_blocks = false; -}; - -} // namespace rocksdb diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index b08ea1934..f4dd5b2ec 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -21,15 +21,17 @@ #include "table/block.h" #include "table/filter_block.h" #include "table/format.h" +#include "table/meta_blocks.h" #include "table/two_level_iterator.h" #include "util/coding.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" -#include "table/block_based_table_options.h" namespace rocksdb { +extern uint64_t kBlockBasedTableMagicNumber; + // The longest the prefix of the cache key used to identify blocks can be. // We are using the fact that we know for Posix files the unique ID is three // varints. @@ -37,12 +39,13 @@ const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1; using std::unique_ptr; struct BlockBasedTable::Rep { - Rep(const EnvOptions& storage_options) : - soptions(storage_options) { - } + Rep(const EnvOptions& storage_options, + const InternalKeyComparator& internal_comparator) + : soptions(storage_options), internal_comparator_(internal_comparator) {} Options options; const EnvOptions& soptions; + const InternalKeyComparator& internal_comparator_; Status status; unique_ptr file; char cache_key_prefix[kMaxCacheKeyPrefixSize]; @@ -223,34 +226,19 @@ Cache::Handle* GetFromBlockCache( Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) { table_reader->reset(); - if (file_size < Footer::kEncodedLength) { - return Status::InvalidArgument("file is too short to be an sstable"); - } - - char footer_space[Footer::kEncodedLength]; - Slice footer_input; - Status s = file->Read(file_size - Footer::kEncodedLength, - Footer::kEncodedLength, &footer_input, footer_space); 
- if (!s.ok()) return s; - - // Check that we actually read the whole footer from the file. It may be - // that size isn't correct. - if (footer_input.size() != Footer::kEncodedLength) { - return Status::InvalidArgument("file is too short to be an sstable"); - } - - Footer footer; - s = footer.DecodeFrom(&footer_input); + Footer footer(kBlockBasedTableMagicNumber); + auto s = ReadFooterFromFile(file.get(), file_size, &footer); if (!s.ok()) return s; // We've successfully read the footer and the index block: we're // ready to serve requests. - Rep* rep = new BlockBasedTable::Rep(soptions); + Rep* rep = new BlockBasedTable::Rep(soptions, internal_comparator); rep->options = options; rep->file = std::move(file); rep->metaindex_handle = footer.metaindex_handle(); @@ -265,10 +253,11 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, // Read the properties meta_iter->Seek(kPropertiesBlock); - if (meta_iter->Valid() && meta_iter->key() == Slice(kPropertiesBlock)) { + if (meta_iter->Valid() && meta_iter->key() == kPropertiesBlock) { s = meta_iter->status(); if (s.ok()) { - s = ReadProperties(meta_iter->value(), rep, &rep->table_properties); + s = ReadProperties(meta_iter->value(), rep->file.get(), rep->options.env, + rep->options.info_log.get(), &rep->table_properties); } if (!s.ok()) { @@ -350,7 +339,7 @@ void BlockBasedTable::SetupForCompaction() { compaction_optimized_ = true; } -TableProperties& BlockBasedTable::GetTableProperties() { +const TableProperties& BlockBasedTable::GetTableProperties() { return rep_->table_properties; } @@ -415,96 +404,6 @@ FilterBlockReader* BlockBasedTable::ReadFilter ( rep->options, block.data, block.heap_allocated); } -Status BlockBasedTable::ReadProperties( - const Slice& handle_value, Rep* rep, TableProperties* table_properties) { - assert(table_properties); - - Slice v = handle_value; - BlockHandle handle; - if (!handle.DecodeFrom(&v).ok()) { - return Status::InvalidArgument("Failed to decode properties 
block handle"); - } - - BlockContents block_contents; - Status s = ReadBlockContents( - rep->file.get(), - ReadOptions(), - handle, - &block_contents, - rep->options.env, - false - ); - - if (!s.ok()) { - return s; - } - - Block properties_block(block_contents); - std::unique_ptr iter( - properties_block.NewIterator(BytewiseComparator()) - ); - - // All pre-defined properties of type uint64_t - std::unordered_map predefined_uint64_properties = { - { BlockBasedTablePropertiesNames::kDataSize, - &table_properties->data_size }, - { BlockBasedTablePropertiesNames::kIndexSize, - &table_properties->index_size }, - { BlockBasedTablePropertiesNames::kFilterSize, - &table_properties->filter_size }, - { BlockBasedTablePropertiesNames::kRawKeySize, - &table_properties->raw_key_size }, - { BlockBasedTablePropertiesNames::kRawValueSize, - &table_properties->raw_value_size }, - { BlockBasedTablePropertiesNames::kNumDataBlocks, - &table_properties->num_data_blocks }, - { BlockBasedTablePropertiesNames::kNumEntries, - &table_properties->num_entries }, - }; - - std::string last_key; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - s = iter->status(); - if (!s.ok()) { - break; - } - - auto key = iter->key().ToString(); - // properties block is strictly sorted with no duplicate key. 
- assert( - last_key.empty() || - BytewiseComparator()->Compare(key, last_key) > 0 - ); - last_key = key; - - auto raw_val = iter->value(); - auto pos = predefined_uint64_properties.find(key); - - if (pos != predefined_uint64_properties.end()) { - // handle predefined rocksdb properties - uint64_t val; - if (!GetVarint64(&raw_val, &val)) { - // skip malformed value - auto error_msg = - "[Warning] detect malformed value in properties meta-block:" - "\tkey: " + key + "\tval: " + raw_val.ToString(); - Log(rep->options.info_log, "%s", error_msg.c_str()); - continue; - } - *(pos->second) = val; - } else if (key == BlockBasedTablePropertiesNames::kFilterPolicy) { - table_properties->filter_policy_name = raw_val.ToString(); - } else { - // handle user-collected - table_properties->user_collected_properties.insert( - std::make_pair(key, raw_val.ToString()) - ); - } - } - - return s; -} - Status BlockBasedTable::GetBlock( const BlockBasedTable* table, const BlockHandle& handle, @@ -764,7 +663,7 @@ Iterator* BlockBasedTable::BlockReader(void* arg, Iterator* iter; if (block != nullptr) { - iter = block->NewIterator(table->rep_->options.comparator); + iter = block->NewIterator(&(table->rep_->internal_comparator_)); if (cache_handle != nullptr) { iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle); } else { @@ -837,7 +736,7 @@ BlockBasedTable::GetFilter(bool no_io) const { // Get the iterator from the index block. 
Iterator* BlockBasedTable::IndexBlockReader(const ReadOptions& options) const { if (rep_->index_block) { - return rep_->index_block->NewIterator(rep_->options.comparator); + return rep_->index_block->NewIterator(&(rep_->internal_comparator_)); } // get index block from cache @@ -858,7 +757,7 @@ Iterator* BlockBasedTable::IndexBlockReader(const ReadOptions& options) const { Iterator* iter; if (entry.value != nullptr) { - iter = entry.value->NewIterator(rep_->options.comparator); + iter = entry.value->NewIterator(&(rep_->internal_comparator_)); if (entry.cache_handle) { iter->RegisterCleanup( &ReleaseBlock, rep_->options.block_cache.get(), entry.cache_handle @@ -872,9 +771,9 @@ Iterator* BlockBasedTable::IndexBlockReader(const ReadOptions& options) const { return iter; } -Iterator* BlockBasedTable::BlockReader(void* arg, - const ReadOptions& options, +Iterator* BlockBasedTable::BlockReader(void* arg, const ReadOptions& options, const EnvOptions& soptions, + const InternalKeyComparator& icomparator, const Slice& index_value, bool for_compaction) { return BlockReader(arg, options, index_value, nullptr, for_compaction); @@ -965,20 +864,15 @@ Iterator* BlockBasedTable::NewIterator(const ReadOptions& options) { } } - return NewTwoLevelIterator( - IndexBlockReader(options), - &BlockBasedTable::BlockReader, - const_cast(this), - options, - rep_->soptions - ); + return NewTwoLevelIterator(IndexBlockReader(options), + &BlockBasedTable::BlockReader, + const_cast(this), options, + rep_->soptions, rep_->internal_comparator_); } Status BlockBasedTable::Get( - const ReadOptions& readOptions, - const Slice& key, - void* handle_context, - bool (*result_handler)(void* handle_context, const Slice& k, + const ReadOptions& readOptions, const Slice& key, void* handle_context, + bool (*result_handler)(void* handle_context, const ParsedInternalKey& k, const Slice& v, bool didIO), void (*mark_key_may_exist_handler)(void* handle_context)) { Status s; @@ -1016,8 +910,13 @@ Status 
BlockBasedTable::Get( // Call the *saver function on each entry/block until it returns false for (block_iter->Seek(key); block_iter->Valid(); block_iter->Next()) { - if (!(*result_handler)(handle_context, block_iter->key(), - block_iter->value(), didIO)) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(block_iter->key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } + + if (!(*result_handler)(handle_context, parsed_key, block_iter->value(), + didIO)) { done = true; break; } @@ -1034,7 +933,8 @@ Status BlockBasedTable::Get( return s; } -bool SaveDidIO(void* arg, const Slice& key, const Slice& value, bool didIO) { +bool SaveDidIO(void* arg, const ParsedInternalKey& key, const Slice& value, + bool didIO) { *reinterpret_cast(arg) = didIO; return false; } @@ -1075,25 +975,4 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) { return result; } -const std::string BlockBasedTable::kFilterBlockPrefix = - "filter."; -const std::string BlockBasedTable::kPropertiesBlock = - "rocksdb.properties"; -const std::string BlockBasedTablePropertiesNames::kDataSize = - "rocksdb.data.size"; -const std::string BlockBasedTablePropertiesNames::kIndexSize = - "rocksdb.index.size"; -const std::string BlockBasedTablePropertiesNames::kFilterSize = - "rocksdb.filter.size"; -const std::string BlockBasedTablePropertiesNames::kRawKeySize = - "rocksdb.raw.key.size"; -const std::string BlockBasedTablePropertiesNames::kRawValueSize = - "rocksdb.raw.value.size"; -const std::string BlockBasedTablePropertiesNames::kNumDataBlocks = - "rocksdb.num.data.blocks"; -const std::string BlockBasedTablePropertiesNames::kNumEntries = - "rocksdb.num.entries"; -const std::string BlockBasedTablePropertiesNames::kFilterPolicy = - "rocksdb.filter.policy"; - } // namespace rocksdb diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 52ece7441..58e5b0716 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -14,8 +14,7 @@ 
#include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/statistics.h" -#include "rocksdb/table_properties.h" -#include "rocksdb/table.h" +#include "table/table_reader.h" #include "util/coding.h" namespace rocksdb { @@ -39,7 +38,6 @@ using std::unique_ptr; class BlockBasedTable : public TableReader { public: static const std::string kFilterBlockPrefix; - static const std::string kPropertiesBlock; // Attempt to open the table that is stored in bytes [0..file_size) // of "file", and read the metadata entries necessary to allow @@ -53,6 +51,7 @@ class BlockBasedTable : public TableReader { // *file must remain live while this Table is in use. static Status Open(const Options& db_options, const EnvOptions& env_options, const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_key_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader); @@ -63,14 +62,13 @@ class BlockBasedTable : public TableReader { // call one of the Seek methods on the iterator before using it). Iterator* NewIterator(const ReadOptions&) override; - Status Get( - const ReadOptions& readOptions, - const Slice& key, - void* handle_context, - bool (*result_handler)(void* handle_context, const Slice& k, - const Slice& v, bool didIO), - void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) - override; + Status Get(const ReadOptions& readOptions, const Slice& key, + void* handle_context, + bool (*result_handler)(void* handle_context, + const ParsedInternalKey& k, const Slice& v, + bool didIO), + void (*mark_key_may_exist_handler)(void* handle_context) = + nullptr) override; // Given a key, return an approximate byte offset in the file where // the data for that key begins (or would begin if the key were @@ -82,13 +80,13 @@ class BlockBasedTable : public TableReader { // Returns true if the block for the specified key is in cache. // REQUIRES: key is in this table. 
- bool TEST_KeyInCache(const ReadOptions& options, const Slice& key) override; + bool TEST_KeyInCache(const ReadOptions& options, const Slice& key); // Set up the table for Compaction. Might change some parameters with // posix_fadvise void SetupForCompaction() override; - TableProperties& GetTableProperties() override; + const TableProperties& GetTableProperties() override; ~BlockBasedTable(); @@ -101,8 +99,9 @@ class BlockBasedTable : public TableReader { bool compaction_optimized_; static Iterator* BlockReader(void*, const ReadOptions&, - const EnvOptions& soptions, const Slice&, - bool for_compaction); + const EnvOptions& soptions, + const InternalKeyComparator& icomparator, + const Slice&, bool for_compaction); static Iterator* BlockReader(void*, const ReadOptions&, const Slice&, bool* didIO, bool for_compaction = false); @@ -142,7 +141,6 @@ class BlockBasedTable : public TableReader { void ReadMeta(const Footer& footer); void ReadFilter(const Slice& filter_handle_value); - static Status ReadProperties(const Slice& handle_value, Rep* rep); // Read the meta block from sst. static Status ReadMetaBlock( @@ -156,10 +154,6 @@ class BlockBasedTable : public TableReader { Rep* rep, size_t* filter_size = nullptr); - // Read the table properties from properties block. 
- static Status ReadProperties( - const Slice& handle_value, Rep* rep, TableProperties* properties); - static void SetupCacheKeyPrefix(Rep* rep); explicit BlockBasedTable(Rep* rep) : @@ -181,15 +175,4 @@ class BlockBasedTable : public TableReader { void operator=(const TableReader&) = delete; }; -struct BlockBasedTablePropertiesNames { - static const std::string kDataSize; - static const std::string kIndexSize; - static const std::string kFilterSize; - static const std::string kRawKeySize; - static const std::string kRawValueSize; - static const std::string kNumDataBlocks; - static const std::string kNumEntries; - static const std::string kFilterPolicy; -}; - } // namespace rocksdb diff --git a/table/block_builder.cc b/table/block_builder.cc index 917601865..f812dbae7 100644 --- a/table/block_builder.cc +++ b/table/block_builder.cc @@ -36,6 +36,7 @@ #include #include #include "rocksdb/comparator.h" +#include "db/dbformat.h" #include "util/coding.h" namespace rocksdb { @@ -51,9 +52,8 @@ BlockBuilder::BlockBuilder(int block_restart_interval, restarts_.push_back(0); // First restart point is at offset 0 } -BlockBuilder::BlockBuilder(const Options& options) - : BlockBuilder(options.block_restart_interval, options.comparator) { -} +BlockBuilder::BlockBuilder(const Options& options, const Comparator* comparator) + : BlockBuilder(options.block_restart_interval, comparator) {} void BlockBuilder::Reset() { buffer_.clear(); diff --git a/table/block_builder.h b/table/block_builder.h index 31faf19b8..ed2f290fd 100644 --- a/table/block_builder.h +++ b/table/block_builder.h @@ -21,7 +21,7 @@ class Comparator; class BlockBuilder { public: BlockBuilder(int block_builder, const Comparator* comparator); - explicit BlockBuilder(const Options& options); + explicit BlockBuilder(const Options& options, const Comparator* comparator); // Reset the contents as if the BlockBuilder was just constructed. 
void Reset(); diff --git a/table/block_test.cc b/table/block_test.cc index 7f33e3a90..588ce6729 100644 --- a/table/block_test.cc +++ b/table/block_test.cc @@ -32,9 +32,12 @@ class BlockTest {}; TEST(BlockTest, SimpleTest) { Random rnd(301); Options options = Options(); + std::unique_ptr ic; + ic.reset(new test::PlainInternalKeyComparator(options.comparator)); + std::vector keys; std::vector values; - BlockBuilder builder(options); + BlockBuilder builder(options, ic.get()); int num_records = 100000; char buf[10]; char* p = &buf[0]; diff --git a/table/filter_block.cc b/table/filter_block.cc index 96ba7cb1d..d7be78e1c 100644 --- a/table/filter_block.cc +++ b/table/filter_block.cc @@ -21,11 +21,12 @@ namespace rocksdb { static const size_t kFilterBaseLg = 11; static const size_t kFilterBase = 1 << kFilterBaseLg; -FilterBlockBuilder::FilterBlockBuilder(const Options& opt) - : policy_(opt.filter_policy), - prefix_extractor_(opt.prefix_extractor), - whole_key_filtering_(opt.whole_key_filtering), - comparator_(opt.comparator){} +FilterBlockBuilder::FilterBlockBuilder(const Options& opt, + const Comparator* internal_comparator) + : policy_(opt.filter_policy), + prefix_extractor_(opt.prefix_extractor), + whole_key_filtering_(opt.whole_key_filtering), + comparator_(internal_comparator) {} void FilterBlockBuilder::StartBlock(uint64_t block_offset) { uint64_t filter_index = (block_offset / kFilterBase); diff --git a/table/filter_block.h b/table/filter_block.h index e47f94653..da19d42e9 100644 --- a/table/filter_block.h +++ b/table/filter_block.h @@ -35,7 +35,8 @@ class FilterPolicy; // (StartBlock AddKey*)* Finish class FilterBlockBuilder { public: - explicit FilterBlockBuilder(const Options& opt); + explicit FilterBlockBuilder(const Options& opt, + const Comparator* internal_comparator); void StartBlock(uint64_t block_offset); void AddKey(const Slice& key); diff --git a/table/filter_block_test.cc b/table/filter_block_test.cc index bc1a0d0ab..1703d59d1 100644 --- 
a/table/filter_block_test.cc +++ b/table/filter_block_test.cc @@ -55,7 +55,7 @@ class FilterBlockTest { }; TEST(FilterBlockTest, EmptyBuilder) { - FilterBlockBuilder builder(options_); + FilterBlockBuilder builder(options_, options_.comparator); Slice block = builder.Finish(); ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block)); FilterBlockReader reader(options_, block); @@ -64,7 +64,7 @@ TEST(FilterBlockTest, EmptyBuilder) { } TEST(FilterBlockTest, SingleChunk) { - FilterBlockBuilder builder(options_); + FilterBlockBuilder builder(options_, options_.comparator); builder.StartBlock(100); builder.AddKey("foo"); builder.AddKey("bar"); @@ -85,7 +85,7 @@ TEST(FilterBlockTest, SingleChunk) { } TEST(FilterBlockTest, MultiChunk) { - FilterBlockBuilder builder(options_); + FilterBlockBuilder builder(options_, options_.comparator); // First filter builder.StartBlock(0); diff --git a/table/format.cc b/table/format.cc index ff6d8fa24..77a55237e 100644 --- a/table/format.cc +++ b/table/format.cc @@ -34,6 +34,7 @@ Status BlockHandle::DecodeFrom(Slice* input) { return Status::Corruption("bad block handle"); } } +const BlockHandle BlockHandle::kNullBlockHandle(0, 0); void Footer::EncodeTo(std::string* dst) const { #ifndef NDEBUG @@ -72,6 +73,30 @@ Status Footer::DecodeFrom(Slice* input) { return result; } +Status ReadFooterFromFile(RandomAccessFile* file, + uint64_t file_size, + Footer* footer) { + if (file_size < Footer::kEncodedLength) { + return Status::InvalidArgument("file is too short to be an sstable"); + } + + char footer_space[Footer::kEncodedLength]; + Slice footer_input; + Status s = file->Read(file_size - Footer::kEncodedLength, + Footer::kEncodedLength, + &footer_input, + footer_space); + if (!s.ok()) return s; + + // Check that we actually read the whole footer from the file. It may be + // that size isn't correct. 
+ if (footer_input.size() != Footer::kEncodedLength) { + return Status::InvalidArgument("file is too short to be an sstable"); + } + + return footer->DecodeFrom(&footer_input); +} + Status ReadBlockContents(RandomAccessFile* file, const ReadOptions& options, const BlockHandle& handle, diff --git a/table/format.h b/table/format.h index 2f1c1e8dc..207527fcb 100644 --- a/table/format.h +++ b/table/format.h @@ -26,6 +26,7 @@ struct ReadOptions; class BlockHandle { public: BlockHandle(); + BlockHandle(uint64_t offset, uint64_t size); // The offset of the block in the file. uint64_t offset() const { return offset_; } @@ -38,19 +39,36 @@ class BlockHandle { void EncodeTo(std::string* dst) const; Status DecodeFrom(Slice* input); + // if the block handle's offset and size are both "0", we will view it + // as a null block handle that points to no where. + bool IsNull() const { + return offset_ == 0 && size_ == 0; + } + + static const BlockHandle& NullBlockHandle() { + return kNullBlockHandle; + } + // Maximum encoding length of a BlockHandle enum { kMaxEncodedLength = 10 + 10 }; private: - uint64_t offset_; - uint64_t size_; + uint64_t offset_ = 0; + uint64_t size_ = 0; + + static const BlockHandle kNullBlockHandle; }; // Footer encapsulates the fixed information stored at the tail // end of every table file. class Footer { public: - Footer() { } + // @table_magic_number serves two purposes: + // 1. Identify different types of the tables. + // 2. Help us to identify if a given file is a valid sst. 
+ Footer(uint64_t table_magic_number) : + kTableMagicNumber(table_magic_number) { + } // The block handle for the metaindex block of the table const BlockHandle& metaindex_handle() const { return metaindex_handle_; } @@ -77,12 +95,13 @@ class Footer { private: BlockHandle metaindex_handle_; BlockHandle index_handle_; + const uint64_t kTableMagicNumber; }; -// kTableMagicNumber was picked by running -// echo http://code.google.com/p/leveldb/ | sha1sum -// and taking the leading 64 bits. -static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull; +// Read the footer from file +Status ReadFooterFromFile(RandomAccessFile* file, + uint64_t file_size, + Footer* footer); // 1-byte type + 32-bit crc static const size_t kBlockTrailerSize = 5; @@ -115,8 +134,13 @@ extern Status UncompressBlockContents(const char* data, // Implementation details follow. Clients should ignore, inline BlockHandle::BlockHandle() - : offset_(~static_cast(0)), - size_(~static_cast(0)) { + : BlockHandle(~static_cast(0), + ~static_cast(0)) { +} + +inline BlockHandle::BlockHandle(uint64_t offset, uint64_t size) + : offset_(offset), + size_(size) { } } // namespace rocksdb diff --git a/table/merger.cc b/table/merger.cc index f5ce7440c..1aed00cc5 100644 --- a/table/merger.cc +++ b/table/merger.cc @@ -11,8 +11,11 @@ #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" +#include "rocksdb/options.h" #include "table/iter_heap.h" #include "table/iterator_wrapper.h" +#include "util/stop_watch.h" +#include "util/perf_context_imp.h" #include @@ -22,10 +25,13 @@ namespace { class MergingIterator : public Iterator { public: - MergingIterator(const Comparator* comparator, Iterator** children, int n) + MergingIterator(Env* const env, const Comparator* comparator, + Iterator** children, int n) : comparator_(comparator), children_(n), current_(nullptr), + use_heap_(true), + env_(env), direction_(kForward), maxHeap_(NewMaxIterHeap(comparator_)), minHeap_ (NewMinIterHeap(comparator_)) { @@ -70,15 +76,52 
@@ class MergingIterator : public Iterator { } virtual void Seek(const Slice& target) { - ClearHeaps(); + // Invalidate the heap. + use_heap_ = false; + IteratorWrapper* first_child = nullptr; + StopWatchNano child_seek_timer(env_, false); + StopWatchNano min_heap_timer(env_, false); for (auto& child : children_) { + StartPerfTimer(&child_seek_timer); child.Seek(target); + BumpPerfTime(&perf_context.seek_child_seek_time, &child_seek_timer); + BumpPerfCount(&perf_context.seek_child_seek_count); + if (child.Valid()) { - minHeap_.push(&child); + // This child has a valid key + if (!use_heap_) { + if (first_child == nullptr) { + // It's the first child with a valid key. Only put it into + // current_. Now the values in the heap should be invalid. + first_child = &child; + } else { + // We have more than one child with valid keys. Initialize + // the heap and put the first child into the heap. + StartPerfTimer(&min_heap_timer); + ClearHeaps(); + BumpPerfTime(&perf_context.seek_min_heap_time, &min_heap_timer); + StartPerfTimer(&min_heap_timer); + minHeap_.push(first_child); + BumpPerfTime(&perf_context.seek_min_heap_time, &min_heap_timer); + } + } + if (use_heap_) { + StartPerfTimer(&min_heap_timer); + minHeap_.push(&child); + BumpPerfTime(&perf_context.seek_min_heap_time, &min_heap_timer); + } + } } - FindSmallest(); - direction_ = kForward; + if (use_heap_) { + // If the heap is valid, need to put the smallest key to current_. + StartPerfTimer(&min_heap_timer); + FindSmallest(); + BumpPerfTime(&perf_context.seek_min_heap_time, &min_heap_timer); + } else { + // The heap is not valid, then the current_ iterator is the first + // one, or null if there is no first child. + current_ = first_child; + } } virtual void Next() { @@ -109,10 +152,14 @@ class MergingIterator : public Iterator { // as the current points to the current record. move the iterator forward. // and if it is valid add it to the heap. 
current_->Next(); - if (current_->Valid()){ - minHeap_.push(current_); + if (use_heap_) { + if (current_->Valid()) { + minHeap_.push(current_); + } + FindSmallest(); + } else if (!current_->Valid()) { + current_ = nullptr; } - FindSmallest(); } virtual void Prev() { @@ -178,6 +225,11 @@ class MergingIterator : public Iterator { const Comparator* comparator_; std::vector children_; IteratorWrapper* current_; + // If the value is true, both of iterators in the heap and current_ + // contain valid rows. If it is false, only current_ can possibly contain + // valid rows. + bool use_heap_; + Env* const env_; // Which direction is the iterator moving? enum Direction { kForward, @@ -189,6 +241,7 @@ class MergingIterator : public Iterator { }; void MergingIterator::FindSmallest() { + assert(use_heap_); if (minHeap_.empty()) { current_ = nullptr; } else { @@ -199,6 +252,7 @@ void MergingIterator::FindSmallest() { } void MergingIterator::FindLargest() { + assert(use_heap_); if (maxHeap_.empty()) { current_ = nullptr; } else { @@ -209,19 +263,21 @@ void MergingIterator::FindLargest() { } void MergingIterator::ClearHeaps() { + use_heap_ = true; maxHeap_ = NewMaxIterHeap(comparator_); minHeap_ = NewMinIterHeap(comparator_); } } // namespace -Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) { +Iterator* NewMergingIterator(Env* const env, const Comparator* cmp, + Iterator** list, int n) { assert(n >= 0); if (n == 0) { return NewEmptyIterator(); } else if (n == 1) { return list[0]; } else { - return new MergingIterator(cmp, list, n); + return new MergingIterator(env, cmp, list, n); } } diff --git a/table/merger.h b/table/merger.h index dbc1f69eb..ea8daa770 100644 --- a/table/merger.h +++ b/table/merger.h @@ -13,6 +13,7 @@ namespace rocksdb { class Comparator; class Iterator; +class Env; // Return an iterator that provided the union of the data in // children[0,n-1]. 
Takes ownership of the child iterators and @@ -22,7 +23,8 @@ class Iterator; // key is present in K child iterators, it will be yielded K times. // // REQUIRES: n >= 0 -extern Iterator* NewMergingIterator( - const Comparator* comparator, Iterator** children, int n); +extern Iterator* NewMergingIterator(Env* const env, + const Comparator* comparator, + Iterator** children, int n); } // namespace rocksdb diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc new file mode 100644 index 000000000..a4d98bb22 --- /dev/null +++ b/table/meta_blocks.cc @@ -0,0 +1,286 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "table/meta_blocks.h" + +#include + +#include "rocksdb/table.h" +#include "table/block.h" +#include "table/format.h" +#include "util/coding.h" + +namespace rocksdb { + +MetaIndexBuilder::MetaIndexBuilder() + : meta_index_block_( + new BlockBuilder(1 /* restart interval */, BytewiseComparator())) { +} + +void MetaIndexBuilder::Add(const std::string& key, + const BlockHandle& handle) { + std::string handle_encoding; + handle.EncodeTo(&handle_encoding); + meta_block_handles_.insert({key, handle_encoding}); +} + +Slice MetaIndexBuilder::Finish() { + for (const auto& metablock : meta_block_handles_) { + meta_index_block_->Add(metablock.first, metablock.second); + } + return meta_index_block_->Finish(); +} + +PropertyBlockBuilder::PropertyBlockBuilder() + : properties_block_( + new BlockBuilder(1 /* restart interval */, BytewiseComparator())) { +} + +void PropertyBlockBuilder::Add(const std::string& name, + const std::string& val) { + props_.insert({name, val}); +} + +void PropertyBlockBuilder::Add(const std::string& name, uint64_t val) { + assert(props_.find(name) == props_.end()); + + std::string dst; + 
PutVarint64(&dst, val); + + Add(name, dst); +} + +void PropertyBlockBuilder::Add( + const UserCollectedProperties& user_collected_properties) { + for (const auto& prop : user_collected_properties) { + Add(prop.first, prop.second); + } +} + +void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { + Add(TablePropertiesNames::kRawKeySize, props.raw_key_size); + Add(TablePropertiesNames::kRawValueSize, props.raw_value_size); + Add(TablePropertiesNames::kDataSize, props.data_size); + Add(TablePropertiesNames::kIndexSize, props.index_size); + Add(TablePropertiesNames::kNumEntries, props.num_entries); + Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks); + Add(TablePropertiesNames::kFilterSize, props.filter_size); + Add(TablePropertiesNames::kFormatVersion, props.format_version); + Add(TablePropertiesNames::kFixedKeyLen, props.fixed_key_len); + + if (!props.filter_policy_name.empty()) { + Add(TablePropertiesNames::kFilterPolicy, + props.filter_policy_name); + } +} + +Slice PropertyBlockBuilder::Finish() { + for (const auto& prop : props_) { + properties_block_->Add(prop.first, prop.second); + } + + return properties_block_->Finish(); +} + +void LogPropertiesCollectionError( + Logger* info_log, const std::string& method, const std::string& name) { + assert(method == "Add" || method == "Finish"); + + std::string msg = + "[Warning] encountered error when calling TablePropertiesCollector::" + + method + "() with collector name: " + name; + Log(info_log, "%s", msg.c_str()); +} + +bool NotifyCollectTableCollectorsOnAdd( + const Slice& key, + const Slice& value, + const Options::TablePropertiesCollectors& collectors, + Logger* info_log) { + bool all_succeeded = true; + for (auto collector : collectors) { + Status s = collector->Add(key, value); + all_succeeded = all_succeeded && s.ok(); + if (!s.ok()) { + LogPropertiesCollectionError( + info_log, "Add", /* method */ collector->Name() + ); + } + } + return all_succeeded; +} + +bool 
NotifyCollectTableCollectorsOnFinish( + const Options::TablePropertiesCollectors& collectors, + Logger* info_log, + PropertyBlockBuilder* builder) { + bool all_succeeded = true; + for (auto collector : collectors) { + UserCollectedProperties user_collected_properties; + Status s = collector->Finish(&user_collected_properties); + + all_succeeded = all_succeeded && s.ok(); + if (!s.ok()) { + LogPropertiesCollectionError( + info_log, "Finish", /* method */ collector->Name() + ); + } else { + builder->Add(user_collected_properties); + } + } + + return all_succeeded; +} + +Status ReadProperties( + const Slice& handle_value, + RandomAccessFile* file, + Env* env, + Logger* logger, + TableProperties* table_properties) { + assert(table_properties); + + Slice v = handle_value; + BlockHandle handle; + if (!handle.DecodeFrom(&v).ok()) { + return Status::InvalidArgument("Failed to decode properties block handle"); + } + + BlockContents block_contents; + ReadOptions read_options; + read_options.verify_checksums = false; + Status s = ReadBlockContents( + file, + read_options, + handle, + &block_contents, + env, + false + ); + + if (!s.ok()) { + return s; + } + + Block properties_block(block_contents); + std::unique_ptr iter( + properties_block.NewIterator(BytewiseComparator()) + ); + + // All pre-defined properties of type uint64_t + std::unordered_map predefined_uint64_properties = { + { TablePropertiesNames::kDataSize, &table_properties->data_size }, + { TablePropertiesNames::kIndexSize, &table_properties->index_size }, + { TablePropertiesNames::kFilterSize, &table_properties->filter_size }, + { TablePropertiesNames::kRawKeySize, &table_properties->raw_key_size }, + { TablePropertiesNames::kRawValueSize, &table_properties->raw_value_size }, + { TablePropertiesNames::kNumDataBlocks, + &table_properties->num_data_blocks }, + { TablePropertiesNames::kNumEntries, &table_properties->num_entries }, + { TablePropertiesNames::kFormatVersion, &table_properties->format_version }, + { 
TablePropertiesNames::kFixedKeyLen, &table_properties->fixed_key_len }, + }; + + std::string last_key; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + s = iter->status(); + if (!s.ok()) { + break; + } + + auto key = iter->key().ToString(); + // properties block is strictly sorted with no duplicate key. + assert( + last_key.empty() || + BytewiseComparator()->Compare(key, last_key) > 0 + ); + last_key = key; + + auto raw_val = iter->value(); + auto pos = predefined_uint64_properties.find(key); + + if (pos != predefined_uint64_properties.end()) { + // handle predefined rocksdb properties + uint64_t val; + if (!GetVarint64(&raw_val, &val)) { + // skip malformed value + auto error_msg = + "[Warning] detect malformed value in properties meta-block:" + "\tkey: " + key + "\tval: " + raw_val.ToString(); + Log(logger, "%s", error_msg.c_str()); + continue; + } + *(pos->second) = val; + } else if (key == TablePropertiesNames::kFilterPolicy) { + table_properties->filter_policy_name = raw_val.ToString(); + } else { + // handle user-collected properties + table_properties->user_collected_properties.insert( + std::make_pair(key, raw_val.ToString()) + ); + } + } + + return s; +} + +Status ReadTableProperties( + RandomAccessFile* file, + uint64_t file_size, + uint64_t table_magic_number, + Env* env, + Logger* info_log, + TableProperties* properties) { + // -- Read metaindex block + Footer footer(table_magic_number); + auto s = ReadFooterFromFile(file, file_size, &footer); + if (!s.ok()) { + return s; + } + + auto metaindex_handle = footer.metaindex_handle(); + BlockContents metaindex_contents; + ReadOptions read_options; + read_options.verify_checksums = false; + s = ReadBlockContents( + file, + read_options, + metaindex_handle, + &metaindex_contents, + env, + false + ); + if (!s.ok()) { + return s; + } + Block metaindex_block(metaindex_contents); + std::unique_ptr meta_iter( + metaindex_block.NewIterator(BytewiseComparator()) + ); + + // -- Read property block + 
meta_iter->Seek(kPropertiesBlock); + TableProperties table_properties; + if (meta_iter->Valid() && + meta_iter->key() == kPropertiesBlock && + meta_iter->status().ok()) { + s = ReadProperties( + meta_iter->value(), + file, + env, + info_log, + properties + ); + } else { + s = Status::Corruption( + "Unable to read the property block from the plain table" + ); + } + + return s; +} + + +} // namespace rocksdb diff --git a/table/meta_blocks.h b/table/meta_blocks.h new file mode 100644 index 000000000..9f236eff6 --- /dev/null +++ b/table/meta_blocks.h @@ -0,0 +1,121 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#pragma once + +#include +#include +#include + +#include "rocksdb/comparator.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/table_properties.h" +#include "table/block_builder.h" + +namespace rocksdb { + +class BlockBuilder; +class BlockHandle; +class Env; +class Logger; +class RandomAccessFile; +struct TableProperties; + +// An STL style comparator that does the bytewise comparator comparison +// internally. +struct BytewiseLessThan { + bool operator()(const std::string& key1, const std::string& key2) const { + // smaller entries will be placed in front. + return comparator->Compare(key1, key2) < 0; + } + + const Comparator* comparator = BytewiseComparator(); +}; + +// When writing to a block that requires entries to be sorted by +// `BytewiseComparator`, we can buffer the content to `BytewiseSortedMap` +// before writing to store. 
+typedef std::map BytewiseSortedMap; + +class MetaIndexBuilder { + public: + MetaIndexBuilder(const MetaIndexBuilder&) = delete; + MetaIndexBuilder& operator=(const MetaIndexBuilder&) = delete; + + MetaIndexBuilder(); + void Add(const std::string& key, const BlockHandle& handle); + + // Write all the added key/value pairs to the block and return the contents + // of the block. + Slice Finish(); + + private: + // store the sorted key/handle of the metablocks. + BytewiseSortedMap meta_block_handles_; + std::unique_ptr meta_index_block_; +}; + +class PropertyBlockBuilder { + public: + PropertyBlockBuilder(const PropertyBlockBuilder&) = delete; + PropertyBlockBuilder& operator=(const PropertyBlockBuilder&) = delete; + + PropertyBlockBuilder(); + + void AddTableProperty(const TableProperties& props); + void Add(const std::string& key, uint64_t value); + void Add(const std::string& key, const std::string& value); + void Add(const UserCollectedProperties& user_collected_properties); + + // Write all the added entries to the block and return the block contents + Slice Finish(); + + private: + std::unique_ptr properties_block_; + BytewiseSortedMap props_; +}; + +// If we encounter any error during user-defined statistics collection, +// we'll write the warning message to the info log. +void LogPropertiesCollectionError( + Logger* info_log, const std::string& method, const std::string& name); + +// Utility functions that help the table builder to trigger batch events for +// user defined property collectors. +// The return value indicates whether any error occurred; if an error +// occurred, the warning message will be logged. +// NotifyCollectTableCollectorsOnAdd() triggers the `Add` event for all +// property collectors. +bool NotifyCollectTableCollectorsOnAdd( + const Slice& key, + const Slice& value, + const Options::TablePropertiesCollectors& collectors, + Logger* info_log); + +// NotifyCollectTableCollectorsOnFinish() triggers the `Finish` event for all +// property collectors. 
The collected properties will be added to `builder`. +bool NotifyCollectTableCollectorsOnFinish( + const Options::TablePropertiesCollectors& collectors, + Logger* info_log, + PropertyBlockBuilder* builder); + +// Read the properties from the table. +Status ReadProperties( + const Slice& handle_value, + RandomAccessFile* file, + Env* env, + Logger* logger, + TableProperties* table_properties); + +// Directly read the properties from the properties block of a plain table. +Status ReadTableProperties( + RandomAccessFile* file, + uint64_t file_size, + uint64_t table_magic_number, + Env* env, + Logger* info_log, + TableProperties* properties); + +} // namespace rocksdb diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc new file mode 100644 index 000000000..e33ac39f2 --- /dev/null +++ b/table/plain_table_builder.cc @@ -0,0 +1,198 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/plain_table_builder.h" + +#include +#include + +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "table/plain_table_factory.h" +#include "db/dbformat.h" +#include "table/block_builder.h" +#include "table/filter_block.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +namespace { + +// a utility that helps writing block content to the file +// @offset will advance if @block_contents was successfully written. +// @block_handle the block handle this particular block. 
+Status WriteBlock( + const Slice& block_contents, + WritableFile* file, + uint64_t* offset, + BlockHandle* block_handle) { + block_handle->set_offset(*offset); + block_handle->set_size(block_contents.size()); + Status s = file->Append(block_contents); + + if (s.ok()) { + *offset += block_contents.size(); + } + return s; +} + +} // namespace + +// kPlainTableMagicNumber was picked by running +// echo rocksdb.plain.table | sha1sum +// and taking the leading 64 bits. +extern const uint64_t kPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; + +PlainTableBuilder::PlainTableBuilder(const Options& options, + WritableFile* file, + uint32_t user_key_len) : + options_(options), file_(file), user_key_len_(user_key_len) { + properties_.fixed_key_len = user_key_len; + + // for plain table, we put all the data in a big chuck. + properties_.num_data_blocks = 1; + // emphasize that currently plain table doesn't have persistent index or + // filter block. + properties_.index_size = 0; + properties_.filter_size = 0; + properties_.format_version = 0; +} + +PlainTableBuilder::~PlainTableBuilder() { +} + +void PlainTableBuilder::Add(const Slice& key, const Slice& value) { + size_t user_key_size = key.size() - 8; + assert(user_key_len_ == 0 || user_key_size == user_key_len_); + + if (!IsFixedLength()) { + // Write key length + key_size_str_.clear(); + PutVarint32(&key_size_str_, user_key_size); + file_->Append(key_size_str_); + offset_ += key_size_str_.length(); + } + + // Write key + ParsedInternalKey parsed_key; + if (!ParseInternalKey(key, &parsed_key)) { + status_ = Status::Corruption(Slice()); + return; + } + if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) { + file_->Append(Slice(key.data(), user_key_size)); + char tmp_char = PlainTableFactory::kValueTypeSeqId0; + file_->Append(Slice(&tmp_char, 1)); + offset_ += key.size() - 7; + } else { + file_->Append(key); + offset_ += key.size(); + } + + // Write value length + value_size_str_.clear(); + int value_size = 
value.size(); + PutVarint32(&value_size_str_, value_size); + file_->Append(value_size_str_); + + // Write value + file_->Append(value); + offset_ += value_size + value_size_str_.length(); + + properties_.num_entries++; + properties_.raw_key_size += key.size(); + properties_.raw_value_size += value.size(); + + // notify property collectors + NotifyCollectTableCollectorsOnAdd( + key, + value, + options_.table_properties_collectors, + options_.info_log.get() + ); +} + +Status PlainTableBuilder::status() const { return status_; } + +Status PlainTableBuilder::Finish() { + assert(!closed_); + closed_ = true; + + properties_.data_size = offset_; + + // Write the following blocks + // 1. [meta block: properties] + // 2. [metaindex block] + // 3. [footer] + MetaIndexBuilder meta_index_builer; + + PropertyBlockBuilder property_block_builder; + // -- Add basic properties + property_block_builder.AddTableProperty(properties_); + + // -- Add user collected properties + NotifyCollectTableCollectorsOnFinish( + options_.table_properties_collectors, + options_.info_log.get(), + &property_block_builder + ); + + // -- Write property block + BlockHandle property_block_handle; + auto s = WriteBlock( + property_block_builder.Finish(), + file_, + &offset_, + &property_block_handle + ); + if (!s.ok()) { + return s; + } + meta_index_builer.Add(kPropertiesBlock, property_block_handle); + + // -- write metaindex block + BlockHandle metaindex_block_handle; + s = WriteBlock( + meta_index_builer.Finish(), + file_, + &offset_, + &metaindex_block_handle + ); + if (!s.ok()) { + return s; + } + + // Write Footer + Footer footer(kPlainTableMagicNumber); + footer.set_metaindex_handle(metaindex_block_handle); + footer.set_index_handle(BlockHandle::NullBlockHandle()); + std::string footer_encoding; + footer.EncodeTo(&footer_encoding); + s = file_->Append(footer_encoding); + if (s.ok()) { + offset_ += footer_encoding.size(); + } + + return s; +} + +void PlainTableBuilder::Abandon() { + closed_ = true; 
+} + +uint64_t PlainTableBuilder::NumEntries() const { + return properties_.num_entries; +} + +uint64_t PlainTableBuilder::FileSize() const { + return offset_; +} + +} // namespace rocksdb diff --git a/table/plain_table_builder.h b/table/plain_table_builder.h new file mode 100644 index 000000000..1793d1d72 --- /dev/null +++ b/table/plain_table_builder.h @@ -0,0 +1,85 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// IndexedTable is a simple table format for UNIT TEST ONLY. It is not built +// as production quality. + +#pragma once +#include +#include "rocksdb/options.h" +#include "rocksdb/status.h" +#include "table/table_builder.h" +#include "rocksdb/table_properties.h" + +namespace rocksdb { + +class BlockBuilder; +class BlockHandle; +class WritableFile; +class TableBuilder; + +class PlainTableBuilder: public TableBuilder { +public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). The output file + // will be part of level specified by 'level'. A value of -1 means + // that the caller does not know which level the output file will reside. + PlainTableBuilder(const Options& options, WritableFile* file, + uint32_t user_key_size); + + // REQUIRES: Either Finish() or Abandon() has been called. + ~PlainTableBuilder(); + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override; + + // Return non-ok iff some error has been detected. + Status status() const override; + + // Finish building the table. 
Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish() override; + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. + uint64_t NumEntries() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const override; + +private: + Options options_; + WritableFile* file_; + uint64_t offset_ = 0; + Status status_; + TableProperties properties_; + + const size_t user_key_len_; + bool closed_ = false; // Either Finish() or Abandon() has been called. + + std::string key_size_str_; + std::string value_size_str_; + + bool IsFixedLength() const { + return user_key_len_ > 0; + } + + // No copying allowed + PlainTableBuilder(const PlainTableBuilder&) = delete; + void operator=(const PlainTableBuilder&) = delete; +}; + +} // namespace rocksdb + diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc new file mode 100644 index 000000000..c7ee8eb2f --- /dev/null +++ b/table/plain_table_factory.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "table/plain_table_factory.h" + +#include +#include +#include "db/dbformat.h" +#include "table/plain_table_builder.h" +#include "table/plain_table_reader.h" +#include "port/port.h" + +namespace rocksdb { + +Status PlainTableFactory::NewTableReader(const Options& options, + const EnvOptions& soptions, + const InternalKeyComparator& icomp, + unique_ptr&& file, + uint64_t file_size, + unique_ptr* table) const { + return PlainTableReader::Open(options, soptions, icomp, std::move(file), + file_size, table, bloom_bits_per_key_, + hash_table_ratio_); +} + +TableBuilder* PlainTableFactory::NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const { + return new PlainTableBuilder(options, file, user_key_len_); +} + +extern TableFactory* NewPlainTableFactory(uint32_t user_key_len, + int bloom_bits_per_key, + double hash_table_ratio) { + return new PlainTableFactory(user_key_len, bloom_bits_per_key, + hash_table_ratio); +} + +} // namespace rocksdb diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h new file mode 100644 index 000000000..382efe3c1 --- /dev/null +++ b/table/plain_table_factory.h @@ -0,0 +1,76 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include + +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +namespace rocksdb { + +struct Options; +struct EnvOptions; + +using std::unique_ptr; +class Status; +class RandomAccessFile; +class WritableFile; +class Table; +class TableBuilder; + +// IndexedTable requires fixed length key, configured as a constructor +// parameter of the factory class. 
Output file format: +// +-------------+-----------------+ +// | version | user_key_length | +// +------------++------------------------------+ <= key1 offset +// | [key_size] | key1 | value_size | | +// +------------+-------------+-------------+ | +// | value1 | +// | | +// +----------------------------------------+---+ <= key2 offset +// | [key_size] | key2 | value_size | | +// +------------+-------------+-------------+ | +// | value2 | +// | | +// | ...... | +// +-----------------+--------------------------+ +// If user_key_length = kPlainTableVariableLength, it means the key is variable +// length, there will be an extra field for key size encoded before every key. +class PlainTableFactory : public TableFactory { + public: + ~PlainTableFactory() {} + // user_key_size is the length of the user key. If it is set to be + // kPlainTableVariableLength, then it means variable length. Otherwise, all + // the keys need to have the fix length of this value. bloom_bits_per_key is + // number of bits used for bloom filer per key. hash_table_ratio is + // the desired utilization of the hash table used for prefix hashing. 
+ // hash_table_ratio = number of prefixes / #buckets in the hash table + explicit PlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength, + int bloom_bits_per_key = 0, + double hash_table_ratio = 0.75) + : user_key_len_(user_key_len), + bloom_bits_per_key_(bloom_bits_per_key), + hash_table_ratio_(hash_table_ratio) {} + const char* Name() const override { return "PlainTable"; } + Status NewTableReader(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table) const override; + TableBuilder* NewTableBuilder(const Options& options, + const InternalKeyComparator& icomparator, + WritableFile* file, + CompressionType compression_type) const + override; + + static const char kValueTypeSeqId0 = 0xFF; + + private: + uint32_t user_key_len_; + int bloom_bits_per_key_; + double hash_table_ratio_; +}; + +} // namespace rocksdb diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc new file mode 100644 index 000000000..b07862bad --- /dev/null +++ b/table/plain_table_reader.cc @@ -0,0 +1,695 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "table/plain_table_reader.h" + +#include + +#include "db/dbformat.h" + +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" + +#include "table/block.h" +#include "table/filter_block.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "table/two_level_iterator.h" +#include "table/plain_table_factory.h" + +#include "util/coding.h" +#include "util/dynamic_bloom.h" +#include "util/hash.h" +#include "util/histogram.h" +#include "util/murmurhash.h" +#include "util/perf_context_imp.h" +#include "util/stop_watch.h" + + +namespace rocksdb { + +namespace { + +inline uint32_t GetSliceHash(Slice const& s) { + return Hash(s.data(), s.size(), 397) ; +} + +inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) { + return hash % num_buckets; +} + +} // namespace + +// Iterator to iterate IndexedTable +class PlainTableIterator : public Iterator { + public: + explicit PlainTableIterator(PlainTableReader* table); + ~PlainTableIterator(); + + bool Valid() const; + + void SeekToFirst(); + + void SeekToLast(); + + void Seek(const Slice& target); + + void Next(); + + void Prev(); + + Slice key() const; + + Slice value() const; + + Status status() const; + + private: + PlainTableReader* table_; + uint32_t offset_; + uint32_t next_offset_; + Slice key_; + Slice value_; + Status status_; + std::string tmp_str_; + // No copying allowed + PlainTableIterator(const PlainTableIterator&) = delete; + void operator=(const Iterator&) = delete; +}; + +extern const uint64_t kPlainTableMagicNumber; +PlainTableReader::PlainTableReader(const EnvOptions& storage_options, + const InternalKeyComparator& icomparator, + uint64_t file_size, int bloom_bits_per_key, + double hash_table_ratio, + const TableProperties& table_properties) + : soptions_(storage_options), + internal_comparator_(icomparator), + file_size_(file_size), + 
kHashTableRatio(hash_table_ratio), + kBloomBitsPerKey(bloom_bits_per_key), + table_properties_(table_properties), + data_end_offset_(table_properties_.data_size), + user_key_len_(table_properties.fixed_key_len) {} + +PlainTableReader::~PlainTableReader() { + delete[] hash_table_; + delete[] sub_index_; + delete bloom_; +} + +Status PlainTableReader::Open(const Options& options, + const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, + uint64_t file_size, + unique_ptr* table_reader, + const int bloom_bits_per_key, + double hash_table_ratio) { + assert(options.allow_mmap_reads); + + if (file_size > kMaxFileSize) { + return Status::NotSupported("File is too large for PlainTableReader!"); + } + + TableProperties table_properties; + auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, + options.env, options.info_log.get(), + &table_properties); + if (!s.ok()) { + return s; + } + + std::unique_ptr new_reader(new PlainTableReader( + soptions, internal_comparator, file_size, bloom_bits_per_key, + hash_table_ratio, table_properties)); + new_reader->file_ = std::move(file); + new_reader->options_ = options; + + // -- Populate Index + s = new_reader->PopulateIndex(); + if (!s.ok()) { + return s; + } + + *table_reader = std::move(new_reader); + return s; +} + +void PlainTableReader::SetupForCompaction() { +} + +bool PlainTableReader::PrefixMayMatch(const Slice& internal_prefix) { + return true; +} + +Iterator* PlainTableReader::NewIterator(const ReadOptions& options) { + return new PlainTableIterator(this); +} + +struct PlainTableReader::IndexRecord { + uint32_t hash; // hash of the prefix + uint32_t offset; // offset of a row + IndexRecord* next; +}; + +// Helper class to track all the index records +class PlainTableReader::IndexRecordList { + public: + explicit IndexRecordList(size_t num_records_per_group) + : kNumRecordsPerGroup(num_records_per_group), + current_group_(nullptr), + 
num_records_in_current_group_(num_records_per_group) {} + + ~IndexRecordList() { + for (size_t i = 0; i < groups_.size(); i++) { + delete[] groups_[i]; + } + } + + void AddRecord(murmur_t hash, uint32_t offset) { + if (num_records_in_current_group_ == kNumRecordsPerGroup) { + current_group_ = AllocateNewGroup(); + num_records_in_current_group_ = 0; + } + auto& new_record = current_group_[num_records_in_current_group_++]; + new_record.hash = hash; + new_record.offset = offset; + new_record.next = nullptr; + } + + size_t GetNumRecords() const { + return (groups_.size() - 1) * kNumRecordsPerGroup + + num_records_in_current_group_; + } + IndexRecord* At(size_t index) { + return &(groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]); + } + + private: + IndexRecord* AllocateNewGroup() { + IndexRecord* result = new IndexRecord[kNumRecordsPerGroup]; + groups_.push_back(result); + return result; + } + + const size_t kNumRecordsPerGroup; + IndexRecord* current_group_; + // List of arrays allocated + std::vector groups_; + size_t num_records_in_current_group_; +}; + +int PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list) { + Slice prev_key_prefix_slice; + uint32_t prev_key_prefix_hash = 0; + uint32_t pos = data_start_offset_; + int key_index_within_prefix = 0; + bool is_first_record = true; + HistogramImpl keys_per_prefix_hist; + // Need map to be ordered to make sure sub indexes generated + // are in order. 
+ + int num_prefixes = 0; + while (pos < data_end_offset_) { + uint32_t key_offset = pos; + ParsedInternalKey key; + Slice value_slice; + status_ = Next(pos, &key, &value_slice, pos); + Slice key_prefix_slice = GetPrefix(key); + + if (is_first_record || prev_key_prefix_slice != key_prefix_slice) { + ++num_prefixes; + if (!is_first_record) { + keys_per_prefix_hist.Add(key_index_within_prefix); + } + key_index_within_prefix = 0; + prev_key_prefix_slice = key_prefix_slice; + prev_key_prefix_hash = GetSliceHash(key_prefix_slice); + } + + if (key_index_within_prefix++ % kIndexIntervalForSamePrefixKeys == 0) { + // Add an index key for every kIndexIntervalForSamePrefixKeys keys + record_list->AddRecord(prev_key_prefix_hash, key_offset); + } + is_first_record = false; + } + + keys_per_prefix_hist.Add(key_index_within_prefix); + Log(options_.info_log, "Number of Keys per prefix Histogram: %s", + keys_per_prefix_hist.ToString().c_str()); + + return num_prefixes; +} + +void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) { + delete[] hash_table_; + + if (kBloomBitsPerKey > 0) { + bloom_ = new DynamicBloom(num_prefixes * kBloomBitsPerKey); + } + double hash_table_size_multipier = + (kHashTableRatio > 1.0) ? 
1.0 : 1.0 / kHashTableRatio; + hash_table_size_ = num_prefixes * hash_table_size_multipier + 1; + hash_table_ = new uint32_t[hash_table_size_]; +} + +size_t PlainTableReader::BucketizeIndexesAndFillBloom( + IndexRecordList& record_list, int num_prefixes, + std::vector* hash_to_offsets, + std::vector* bucket_count) { + size_t sub_index_size_needed = 0; + bool first = true; + uint32_t prev_hash = 0; + size_t num_records = record_list.GetNumRecords(); + for (size_t i = 0; i < num_records; i++) { + IndexRecord* index_record = record_list.At(i); + uint32_t cur_hash = index_record->hash; + if (first || prev_hash != cur_hash) { + prev_hash = cur_hash; + first = false; + if (bloom_) { + bloom_->AddHash(cur_hash); + } + } + uint32_t bucket = GetBucketIdFromHash(cur_hash, hash_table_size_); + IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket]; + index_record->next = prev_bucket_head; + (*hash_to_offsets)[bucket] = index_record; + auto& item_count = (*bucket_count)[bucket]; + if (item_count > 0) { + if (item_count == 1) { + sub_index_size_needed += kOffsetLen + 1; + } + if (item_count == 127) { + // Need more than one byte for length + sub_index_size_needed++; + } + sub_index_size_needed += kOffsetLen; + } + item_count++; + } + return sub_index_size_needed; +} + +void PlainTableReader::FillIndexes( + size_t sub_index_size_needed, + const std::vector& hash_to_offsets, + const std::vector& bucket_count) { + Log(options_.info_log, "Reserving %zu bytes for sub index", + sub_index_size_needed); + // 8 bytes buffer for variable length size + size_t buffer_size = 8 * 8; + size_t buffer_used = 0; + sub_index_size_needed += buffer_size; + sub_index_ = new char[sub_index_size_needed]; + size_t sub_index_offset = 0; + char* prev_ptr; + char* cur_ptr; + uint32_t* sub_index_ptr; + for (int i = 0; i < hash_table_size_; i++) { + uint32_t num_keys_for_bucket = bucket_count[i]; + switch (num_keys_for_bucket) { + case 0: + // No key for bucket + hash_table_[i] = data_end_offset_; + 
break; + case 1: + // point directly to the file offset + hash_table_[i] = hash_to_offsets[i]->offset; + break; + default: + // point to second level indexes. + hash_table_[i] = sub_index_offset | kSubIndexMask; + prev_ptr = sub_index_ + sub_index_offset; + cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket); + sub_index_offset += (cur_ptr - prev_ptr); + if (cur_ptr - prev_ptr > 2 + || (cur_ptr - prev_ptr == 2 && num_keys_for_bucket <= 127)) { + // Need to resize sub_index. Exponentially grow buffer. + buffer_used += cur_ptr - prev_ptr - 1; + if (buffer_used + 4 > buffer_size) { + Log(options_.info_log, "Recalculate suffix_map length to %zu", + sub_index_size_needed); + + sub_index_size_needed += buffer_size; + buffer_size *= 2; + char* new_sub_index = new char[sub_index_size_needed]; + memcpy(new_sub_index, sub_index_, sub_index_offset); + delete[] sub_index_; + sub_index_ = new_sub_index; + } + } + sub_index_ptr = (uint32_t*) (sub_index_ + sub_index_offset); + IndexRecord* record = hash_to_offsets[i]; + int j; + for (j = num_keys_for_bucket - 1; j >= 0 && record; + j--, record = record->next) { + sub_index_ptr[j] = record->offset; + } + assert(j == -1 && record == nullptr); + sub_index_offset += kOffsetLen * num_keys_for_bucket; + break; + } + } + + Log(options_.info_log, "hash table size: %d, suffix_map length %zu", + hash_table_size_, sub_index_size_needed); +} + +Status PlainTableReader::PopulateIndex() { + // Get mmapped memory to file_data_. + Status s = file_->Read(0, file_size_, &file_data_, nullptr); + if (!s.ok()) { + return s; + } + + IndexRecordList record_list(kRecordsPerGroup); + // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows + // for a prefix (starting from the first one), generate a record of (hash, + // offset) and append it to IndexRecordList, which is a data structure created + // to store them. 
+ int num_prefixes = PopulateIndexRecordList(&record_list); + // Calculated hash table and bloom filter size and allocate memory for indexes + // and bloom filter based on the number of prefixes. + AllocateIndexAndBloom(num_prefixes); + + // Bucketize all the index records to a temp data structure, in which for + // each bucket, we generate a linked list of IndexRecord, in reversed order. + std::vector hash_to_offsets(hash_table_size_, nullptr); + std::vector bucket_count(hash_table_size_, 0); + size_t sub_index_size_needed = BucketizeIndexesAndFillBloom( + record_list, num_prefixes, &hash_to_offsets, &bucket_count); + // From the temp data structure, populate indexes. + FillIndexes(sub_index_size_needed, hash_to_offsets, bucket_count); + + return Status::OK(); +} + +Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, + uint32_t prefix_hash, bool& prefix_matched, + uint32_t& ret_offset) { + prefix_matched = false; + int bucket = GetBucketIdFromHash(prefix_hash, hash_table_size_); + uint32_t bucket_value = hash_table_[bucket]; + if (bucket_value == data_end_offset_) { + ret_offset = data_end_offset_; + return Status::OK(); + } else if ((bucket_value & kSubIndexMask) == 0) { + // point directly to the file + ret_offset = bucket_value; + return Status::OK(); + } + + // point to sub-index, need to do a binary search + uint32_t low = 0; + uint64_t prefix_index_offset = bucket_value ^ kSubIndexMask; + + const char* index_ptr = sub_index_ + prefix_index_offset; + uint32_t upper_bound = 0; + const uint32_t* base_ptr = (const uint32_t*) GetVarint32Ptr(index_ptr, + index_ptr + 4, + &upper_bound); + uint32_t high = upper_bound; + ParsedInternalKey mid_key; + ParsedInternalKey parsed_target; + if (!ParseInternalKey(target, &parsed_target)) { + return Status::Corruption(Slice()); + } + + // The key is between [low, high). Do a binary search between it. 
+ while (high - low > 1) { + uint32_t mid = (high + low) / 2; + uint32_t file_offset = base_ptr[mid]; + size_t tmp; + Status s = ReadKey(file_data_.data() + file_offset, &mid_key, tmp); + if (!s.ok()) { + return s; + } + int cmp_result = internal_comparator_.Compare(mid_key, parsed_target); + if (cmp_result < 0) { + low = mid; + } else { + if (cmp_result == 0) { + // Happen to have found the exact key or target is smaller than the + // first key after base_offset. + prefix_matched = true; + ret_offset = file_offset; + return Status::OK(); + } else { + high = mid; + } + } + } + // Both of the key at the position low or low+1 could share the same + // prefix as target. We need to rule out one of them to avoid to go + // to the wrong prefix. + ParsedInternalKey low_key; + size_t tmp; + uint32_t low_key_offset = base_ptr[low]; + Status s = ReadKey(file_data_.data() + low_key_offset, &low_key, tmp); + if (GetPrefix(low_key) == prefix) { + prefix_matched = true; + ret_offset = low_key_offset; + } else if (low + 1 < upper_bound) { + // There is possible a next prefix, return it + prefix_matched = false; + ret_offset = base_ptr[low + 1]; + } else { + // target is larger than a key of the last prefix in this bucket + // but with a different prefix. Key does not exist. 
+ ret_offset = data_end_offset_; + } + return Status::OK(); +} + +bool PlainTableReader::MayHavePrefix(uint32_t hash) { + return bloom_ == nullptr || bloom_->MayContainHash(hash); +} + +Slice PlainTableReader::GetPrefix(const ParsedInternalKey& target) { + return options_.prefix_extractor->Transform(target.user_key); +} + +Status PlainTableReader::ReadKey(const char* row_ptr, ParsedInternalKey* key, + size_t& bytes_read) { + const char* key_ptr = nullptr; + bytes_read = 0; + size_t user_key_size = 0; + if (IsFixedLength()) { + user_key_size = user_key_len_; + key_ptr = row_ptr; + } else { + uint32_t tmp_size = 0; + key_ptr = GetVarint32Ptr(row_ptr, file_data_.data() + data_end_offset_, + &tmp_size); + if (key_ptr == nullptr) { + return Status::Corruption("Unable to read the next key"); + } + user_key_size = (size_t)tmp_size; + bytes_read = key_ptr - row_ptr; + } + if (key_ptr + user_key_size + 1 >= file_data_.data() + data_end_offset_) { + return Status::Corruption("Unable to read the next key"); + } + + if (*(key_ptr + user_key_size) == PlainTableFactory::kValueTypeSeqId0) { + // Special encoding for the row with seqID=0 + key->user_key = Slice(key_ptr, user_key_size); + key->sequence = 0; + key->type = kTypeValue; + bytes_read += user_key_size + 1; + } else { + if (row_ptr + user_key_size + 8 >= file_data_.data() + data_end_offset_) { + return Status::Corruption("Unable to read the next key"); + } + if (!ParseInternalKey(Slice(key_ptr, user_key_size + 8), key)) { + return Status::Corruption(Slice()); + } + bytes_read += user_key_size + 8; + } + + return Status::OK(); +} + +Status PlainTableReader::Next(uint32_t offset, ParsedInternalKey* key, + Slice* value, uint32_t& next_offset) { + if (offset == data_end_offset_) { + next_offset = data_end_offset_; + return Status::OK(); + } + + if (offset > data_end_offset_) { + return Status::Corruption("Offset is out of file size"); + } + + const char* row_ptr = file_data_.data() + offset; + size_t bytes_for_key; + Status s 
= ReadKey(row_ptr, key, bytes_for_key); + uint32_t value_size; + const char* value_ptr = GetVarint32Ptr(row_ptr + bytes_for_key, + file_data_.data() + data_end_offset_, + &value_size); + if (value_ptr == nullptr) { + return Status::Corruption("Error reading value length."); + } + next_offset = offset + (value_ptr - row_ptr) + value_size; + if (next_offset > data_end_offset_) { + return Status::Corruption("Reach end of file when reading value"); + } + *value = Slice(value_ptr, value_size); + + return Status::OK(); +} + +Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, + void* arg, + bool (*saver)(void*, const ParsedInternalKey&, + const Slice&, bool), + void (*mark_key_may_exist)(void*)) { + // Check bloom filter first. + Slice prefix_slice = GetPrefix(target); + uint32_t prefix_hash = GetSliceHash(prefix_slice); + if (!MayHavePrefix(prefix_hash)) { + return Status::OK(); + } + uint32_t offset; + bool prefix_match; + Status s = GetOffset(target, prefix_slice, prefix_hash, prefix_match, offset); + if (!s.ok()) { + return s; + } + ParsedInternalKey found_key; + ParsedInternalKey parsed_target; + if (!ParseInternalKey(target, &parsed_target)) { + return Status::Corruption(Slice()); + } + + Slice found_value; + while (offset < data_end_offset_) { + Status s = Next(offset, &found_key, &found_value, offset); + if (!s.ok()) { + return s; + } + if (!prefix_match) { + // Need to verify prefix for the first key found if it is not yet + // checked. 
+ if (GetPrefix(found_key) != prefix_slice) { + return Status::OK(); + } + prefix_match = true; + } + if (internal_comparator_.Compare(found_key, parsed_target) >= 0) { + if (!(*saver)(arg, found_key, found_value, true)) { + break; + } + } + } + return Status::OK(); +} + +uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& key) { + return 0; +} + +PlainTableIterator::PlainTableIterator(PlainTableReader* table) : + table_(table) { + next_offset_ = offset_ = table_->data_end_offset_; +} + +PlainTableIterator::~PlainTableIterator() { +} + +bool PlainTableIterator::Valid() const { + return offset_ < table_->data_end_offset_ + && offset_ >= table_->data_start_offset_; +} + +void PlainTableIterator::SeekToFirst() { + next_offset_ = table_->data_start_offset_; + if (next_offset_ >= table_->data_end_offset_) { + next_offset_ = offset_ = table_->data_end_offset_; + } else { + Next(); + } +} + +void PlainTableIterator::SeekToLast() { + assert(false); +} + +void PlainTableIterator::Seek(const Slice& target) { + Slice prefix_slice = table_->GetPrefix(target); + uint32_t prefix_hash = GetSliceHash(prefix_slice); + if (!table_->MayHavePrefix(prefix_hash)) { + offset_ = next_offset_ = table_->data_end_offset_; + return; + } + bool prefix_match; + status_ = table_->GetOffset(target, prefix_slice, prefix_hash, prefix_match, + next_offset_); + if (!status_.ok()) { + offset_ = next_offset_ = table_->data_end_offset_; + return; + } + + if (next_offset_ < table_-> data_end_offset_) { + for (Next(); status_.ok() && Valid(); Next()) { + if (!prefix_match) { + // Need to verify the first key's prefix + if (table_->GetPrefix(key()) != prefix_slice) { + offset_ = next_offset_ = table_->data_end_offset_; + break; + } + prefix_match = true; + } + if (table_->internal_comparator_.Compare(key(), target) >= 0) { + break; + } + } + } else { + offset_ = table_->data_end_offset_; + } +} + +void PlainTableIterator::Next() { + offset_ = next_offset_; + if (offset_ < table_->data_end_offset_) 
{ + Slice tmp_slice; + ParsedInternalKey parsed_key; + status_ = table_->Next(next_offset_, &parsed_key, &value_, next_offset_); + if (status_.ok()) { + // Make a copy in this case. TODO optimize. + tmp_str_.clear(); + AppendInternalKey(&tmp_str_, parsed_key); + key_ = Slice(tmp_str_); + } else { + offset_ = next_offset_ = table_->data_end_offset_; + } + } +} + +void PlainTableIterator::Prev() { + assert(false); +} + +Slice PlainTableIterator::key() const { + assert(Valid()); + return key_; +} + +Slice PlainTableIterator::value() const { + assert(Valid()); + return value_; +} + +Status PlainTableIterator::status() const { + return status_; +} + +} // namespace rocksdb diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h new file mode 100644 index 000000000..1abe4e35c --- /dev/null +++ b/table/plain_table_reader.h @@ -0,0 +1,220 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/table_reader.h" +#include "table/plain_table_factory.h" + +namespace rocksdb { + +class Block; +class BlockHandle; +class Footer; +struct Options; +class RandomAccessFile; +struct ReadOptions; +class TableCache; +class TableReader; +class DynamicBloom; +class InternalKeyComparator; + +using std::unique_ptr; +using std::unordered_map; +extern const uint32_t kPlainTableVariableLength; + +// Based on following output file format shown in plain_table_factory.h +// When opening the output file, IndexedTableReader creates a hash table +// from key prefixes to offset of the output file. 
IndexedTable will decide +// whether it points to the data offset of the first key with the key prefix +// or the offset of it. If there are too many keys share this prefix, it will +// create a binary search-able index from the suffix to offset on disk. +// +// The implementation of IndexedTableReader requires output file is mmaped +class PlainTableReader: public TableReader { + public: + static Status Open(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table, + const int bloom_bits_per_key, double hash_table_ratio); + + bool PrefixMayMatch(const Slice& internal_prefix); + + Iterator* NewIterator(const ReadOptions&); + + Status Get(const ReadOptions&, const Slice& key, void* arg, + bool (*result_handler)(void* arg, const ParsedInternalKey& k, + const Slice& v, bool), + void (*mark_key_may_exist)(void*) = nullptr); + + uint64_t ApproximateOffsetOf(const Slice& key); + + void SetupForCompaction(); + + const TableProperties& GetTableProperties() { return table_properties_; } + + PlainTableReader(const EnvOptions& storage_options, + const InternalKeyComparator& internal_comparator, + uint64_t file_size, int bloom_num_bits, + double hash_table_ratio, + const TableProperties& table_properties); + ~PlainTableReader(); + + private: + struct IndexRecord; + class IndexRecordList; + + uint32_t* hash_table_ = nullptr; + int hash_table_size_ = 0; + char* sub_index_ = nullptr; + + Options options_; + const EnvOptions& soptions_; + const InternalKeyComparator internal_comparator_; + Status status_; + unique_ptr file_; + + Slice file_data_; + uint32_t version_; + uint32_t file_size_; + + const double kHashTableRatio; + const int kBloomBitsPerKey; + DynamicBloom* bloom_ = nullptr; + + TableProperties table_properties_; + const uint32_t data_start_offset_ = 0; + const uint32_t data_end_offset_; + const size_t user_key_len_; + + static const size_t kNumInternalBytes = 8; + 
static const uint32_t kSubIndexMask = 0x80000000; + static const size_t kOffsetLen = sizeof(uint32_t); + static const uint64_t kMaxFileSize = 1u << 31; + static const size_t kRecordsPerGroup = 256; + // To speed up the search for keys with same prefix, we'll add index key for + // every N keys, where the "N" is determined by + // kIndexIntervalForSamePrefixKeys + static const size_t kIndexIntervalForSamePrefixKeys = 16; + + bool IsFixedLength() const { + return user_key_len_ != kPlainTableVariableLength; + } + + size_t GetFixedInternalKeyLength() const { + return user_key_len_ + kNumInternalBytes; + } + + friend class TableCache; + friend class PlainTableIterator; + + // Internal helper function to generate an IndexRecordList object from all + // the rows, which contains index records as a list. + int PopulateIndexRecordList(IndexRecordList* record_list); + + // Internal helper function to allocate memory for indexes and bloom filters + void AllocateIndexAndBloom(int num_prefixes); + + // Internal helper function to bucket index record list to hash buckets. + // hash_to_offsets is sized of of hash_table_size_, each contains a linked + // list + // of offsets for the hash, in reversed order. + // bucket_count is sized of hash_table_size_. The value is how many index + // records are there in hash_to_offsets for the same bucket. + size_t BucketizeIndexesAndFillBloom( + IndexRecordList& record_list, int num_prefixes, + std::vector* hash_to_offsets, + std::vector* bucket_count); + + // Internal helper class to fill the indexes and bloom filters to internal + // data structures. hash_to_offsets and bucket_count are bucketized indexes + // and counts generated by BucketizeIndexesAndFillBloom(). + void FillIndexes(size_t sub_index_size_needed, + const std::vector& hash_to_offsets, + const std::vector& bucket_count); + + // PopulateIndex() builds index of keys. It must be called before any query + // to the table. 
+ // + // hash_table_ contains buckets size of hash_table_size_, each is a 32-bit + // integer. The lower 31 bits contain an offset value (explained below) and + // the first bit of the integer indicates type of the offset. + // + // +--------------+------------------------------------------------------+ + // | Flag (1 bit) | Offset to binary search buffer or file (31 bits) + + // +--------------+------------------------------------------------------+ + // + // Explanation for the "flag bit": + // + // 0 indicates that the bucket contains only one prefix (no conflict when + // hashing this prefix), whose first row starts from this offset of the + // file. + // 1 indicates that the bucket contains more than one prefixes, or there + // are too many rows for one prefix so we need a binary search for it. In + // this case, the offset indicates the offset of sub_index_ holding the + // binary search indexes of keys for those rows. Those binary search indexes + // are organized in this way: + // + // The first 4 bytes, indicate how many indexes (N) are stored after it. After + // it, there are N 32-bit integers, each points of an offset of the file, + // which + // points to starting of a row. Those offsets need to be guaranteed to be in + // ascending order so the keys they are pointing to are also in ascending + // order + // to make sure we can use them to do binary searches. Below is visual + // presentation of a bucket. + // + // + // number_of_records: varint32 + // record 1 file offset: fixedint32 + // record 2 file offset: fixedint32 + // .... + // record N file offset: fixedint32 + // + Status PopulateIndex(); + + // Check bloom filter to see whether it might contain this prefix. + // The hash of the prefix is given, since it can be reused for index lookup + // too. + bool MayHavePrefix(uint32_t hash); + + Status ReadKey(const char* row_ptr, ParsedInternalKey* key, + size_t& bytes_read); + // Read the key and value at offset to key and value. 
+ // tmp_slice is a tmp slice. + // return next_offset as the offset for the next key. + Status Next(uint32_t offset, ParsedInternalKey* key, Slice* value, + uint32_t& next_offset); + // Get file offset for key target. + // return value prefix_matched is set to true if the offset is confirmed + // for a key with the same prefix as target. + Status GetOffset(const Slice& target, const Slice& prefix, + uint32_t prefix_hash, bool& prefix_matched, + uint32_t& ret_offset); + + Slice GetPrefix(const Slice& target) { + assert(target.size() >= 8); // target is internal key + return options_.prefix_extractor->Transform( + Slice(target.data(), target.size() - 8)); + } + + Slice GetPrefix(const ParsedInternalKey& target); + + // No copying allowed + explicit PlainTableReader(const TableReader&) = delete; + void operator=(const TableReader&) = delete; +}; +} // namespace rocksdb diff --git a/table/table_builder.h b/table/table_builder.h new file mode 100644 index 000000000..ee32cff86 --- /dev/null +++ b/table/table_builder.h @@ -0,0 +1,55 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +namespace rocksdb { + +class Slice; +class Status; + +// TableBuilder provides the interface used to build a Table +// (an immutable and sorted map from keys to values). +// +// Multiple threads can invoke const methods on a TableBuilder without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same TableBuilder must use +// external synchronization. 
+class TableBuilder { + public: + // REQUIRES: Either Finish() or Abandon() has been called. + virtual ~TableBuilder() {} + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + virtual void Add(const Slice& key, const Slice& value) = 0; + + // Return non-ok iff some error has been detected. + virtual Status status() const = 0; + + // Finish building the table. + // REQUIRES: Finish(), Abandon() have not been called + virtual Status Finish() = 0; + + // Indicate that the contents of this builder should be abandoned. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + virtual void Abandon() = 0; + + // Number of calls to Add() so far. + virtual uint64_t NumEntries() const = 0; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + virtual uint64_t FileSize() const = 0; +}; + +} // namespace rocksdb diff --git a/table/table_properties.cc b/table/table_properties.cc new file mode 100644 index 000000000..414b15681 --- /dev/null +++ b/table/table_properties.cc @@ -0,0 +1,114 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "rocksdb/table_properties.h" + +namespace rocksdb { + +namespace { + void AppendProperty( + std::string& props, + const std::string& key, + const std::string& value, + const std::string& prop_delim, + const std::string& kv_delim) { + props.append(key); + props.append(kv_delim); + props.append(value); + props.append(prop_delim); + } + + template + void AppendProperty( + std::string& props, + const std::string& key, + const TValue& value, + const std::string& prop_delim, + const std::string& kv_delim) { + AppendProperty( + props, key, std::to_string(value), prop_delim, kv_delim + ); + } +} + +std::string TableProperties::ToString( + const std::string& prop_delim, + const std::string& kv_delim) const { + std::string result; + result.reserve(1024); + + // Basic Info + AppendProperty( + result, "# data blocks", num_data_blocks, prop_delim, kv_delim + ); + AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim); + + AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim); + AppendProperty( + result, + "raw average key size", + num_entries != 0 ? 1.0 * raw_key_size / num_entries : 0.0, + prop_delim, + kv_delim + ); + AppendProperty( + result, "raw value size", raw_value_size, prop_delim, kv_delim + ); + AppendProperty( + result, + "raw average value size", + num_entries != 0 ? 1.0 * raw_value_size / num_entries : 0.0, + prop_delim, + kv_delim + ); + + AppendProperty(result, "data block size", data_size, prop_delim, kv_delim); + AppendProperty(result, "index block size", index_size, prop_delim, kv_delim); + AppendProperty( + result, "filter block size", filter_size, prop_delim, kv_delim + ); + AppendProperty( + result, + "(estimated) table size", + data_size + index_size + filter_size, + prop_delim, + kv_delim + ); + + AppendProperty( + result, + "filter policy name", + filter_policy_name.empty() ? 
std::string("N/A") : filter_policy_name, + prop_delim, + kv_delim + ); + + return result; +} + +const std::string TablePropertiesNames::kDataSize = + "rocksdb.data.size"; +const std::string TablePropertiesNames::kIndexSize = + "rocksdb.index.size"; +const std::string TablePropertiesNames::kFilterSize = + "rocksdb.filter.size"; +const std::string TablePropertiesNames::kRawKeySize = + "rocksdb.raw.key.size"; +const std::string TablePropertiesNames::kRawValueSize = + "rocksdb.raw.value.size"; +const std::string TablePropertiesNames::kNumDataBlocks = + "rocksdb.num.data.blocks"; +const std::string TablePropertiesNames::kNumEntries = + "rocksdb.num.entries"; +const std::string TablePropertiesNames::kFilterPolicy = + "rocksdb.filter.policy"; +const std::string TablePropertiesNames::kFormatVersion = + "rocksdb.format.version"; +const std::string TablePropertiesNames::kFixedKeyLen = + "rocksdb.fixed.key.length"; + +extern const std::string kPropertiesBlock = "rocksdb.properties"; + +} // namespace rocksdb diff --git a/table/table_reader.h b/table/table_reader.h new file mode 100644 index 000000000..9acbb33d0 --- /dev/null +++ b/table/table_reader.h @@ -0,0 +1,71 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +namespace rocksdb { + +class Iterator; +struct ParsedInternalKey; +class Slice; +struct ReadOptions; +struct TableProperties; + +// A Table is a sorted map from strings to strings. Tables are +// immutable and persistent. 
A Table may be safely accessed from +// multiple threads without external synchronization. +class TableReader { + public: + virtual ~TableReader() {} + + // Determine whether there is a chance that the current table file + // contains the key a key starting with iternal_prefix. The specific + // table implementation can use bloom filter and/or other heuristic + // to filter out this table as a whole. + virtual bool PrefixMayMatch(const Slice& internal_prefix) = 0; + + // Returns a new iterator over the table contents. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + virtual Iterator* NewIterator(const ReadOptions&) = 0; + + // Given a key, return an approximate byte offset in the file where + // the data for that key begins (or would begin if the key were + // present in the file). The returned value is in terms of file + // bytes, and so includes effects like compression of the underlying data. + // E.g., the approximate offset of the last key in the table will + // be close to the file length. + virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0; + + // Set up the table for Compaction. Might change some parameters with + // posix_fadvise + virtual void SetupForCompaction() = 0; + + virtual const TableProperties& GetTableProperties() = 0; + + // Calls (*result_handler)(handle_context, ...) repeatedly, starting with + // the entry found after a call to Seek(key), until result_handler returns + // false, where k is the actual internal key for a row found and v as the + // value of the key. didIO is true if I/O is involved in the operation. May + // not make such a call if filter policy says that key is not present. + // + // mark_key_may_exist_handler needs to be called when it is configured to be + // memory only and the key is not found in the block cache, with + // the parameter to be handle_context. 
+ // + // readOptions is the options for the read + // key is the key to search for + virtual Status Get( + const ReadOptions& readOptions, const Slice& key, void* handle_context, + bool (*result_handler)(void* arg, const ParsedInternalKey& k, + const Slice& v, bool didIO), + void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0; +}; + +} // namespace rocksdb diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index e7b6b0b7a..f746592fe 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -6,12 +6,13 @@ #include #include "rocksdb/db.h" -#include "rocksdb/table.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" #include "db/db_impl.h" #include "db/dbformat.h" #include "port/atomic_pointer.h" #include "table/block_based_table_factory.h" +#include "table/plain_table_factory.h" #include "util/histogram.h" #include "util/testharness.h" #include "util/testutil.h" @@ -33,8 +34,8 @@ static std::string MakeKey(int i, int j, bool through_db) { return key.Encode().ToString(); } -static bool DummySaveValue(void* arg, const Slice& ikey, const Slice& v, - bool didIO) { +static bool DummySaveValue(void* arg, const ParsedInternalKey& ikey, + const Slice& v, bool didIO) { return false; } @@ -70,7 +71,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, Status s; if (!through_db) { env->NewWritableFile(file_name, &file, env_options); - tb = opts.table_factory->GetTableBuilder(opts, file.get(), + tb = opts.table_factory->NewTableBuilder(opts, file.get(), CompressionType::kNoCompression); } else { s = DB::Open(opts, dbname, &db); @@ -101,7 +102,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, Status s = env->NewRandomAccessFile(file_name, &raf, env_options); uint64_t file_size; env->GetFileSize(file_name, &file_size); - s = opts.table_factory->GetTableReader(opts, env_options, std::move(raf), + s = opts.table_factory->NewTableReader(opts, env_options, std::move(raf), 
file_size, &table_reader); } @@ -218,6 +219,8 @@ DEFINE_bool(iterator, false, "For test iterator"); DEFINE_bool(through_db, false, "If enable, a DB instance will be created and " "the query will be against DB. Otherwise, will be directly against " "a table reader."); +DEFINE_bool(plain_table, false, "Use PlainTable"); + int main(int argc, char** argv) { google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + @@ -230,10 +233,23 @@ int main(int argc, char** argv) { options.prefix_extractor = rocksdb::NewFixedPrefixTransform( FLAGS_prefix_len); } - options.SetUpDefaultFlushBlockPolicyFactory(); rocksdb::ReadOptions ro; rocksdb::EnvOptions env_options; options.create_if_missing = true; + options.compression = rocksdb::CompressionType::kNoCompression; + options.internal_comparator = + new rocksdb::InternalKeyComparator(options.comparator); + + if (FLAGS_plain_table) { + options.allow_mmap_reads = true; + env_options.use_mmap_reads = true; + tf = new rocksdb::PlainTableFactory(16, (FLAGS_prefix_len == 16) ? 0 : 8, + 0.75); + options.prefix_extractor = rocksdb::NewFixedPrefixTransform( + FLAGS_prefix_len); + } else { + tf = new rocksdb::BlockBasedTableFactory(); + } options.table_factory = std::shared_ptr(tf); TableReaderBenchmark(options, env_options, ro, FLAGS_num_keys1, diff --git a/table/table_test.cc b/table/table_test.cc index 5b312f272..e473b8007 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -6,6 +6,7 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include #include #include #include @@ -16,17 +17,22 @@ #include "util/statistics.h" #include "db/memtable.h" #include "db/write_batch_internal.h" + #include "rocksdb/cache.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" #include "rocksdb/memtablerep.h" +#include "table/block.h" #include "table/block_based_table_builder.h" #include "table/block_based_table_factory.h" #include "table/block_based_table_reader.h" #include "table/block_builder.h" -#include "table/block.h" #include "table/format.h" +#include "table/meta_blocks.h" +#include "table/plain_table_factory.h" + #include "util/random.h" #include "util/testharness.h" #include "util/testutil.h" @@ -34,15 +40,12 @@ namespace rocksdb { namespace { + // Return reverse of "key". // Used to test non-lexicographic comparators. -static std::string Reverse(const Slice& key) { - std::string str(key.ToString()); - std::string rev(""); - for (std::string::reverse_iterator rit = str.rbegin(); - rit != str.rend(); ++rit) { - rev.push_back(*rit); - } +std::string Reverse(const Slice& key) { + auto rev = key.ToString(); + std::reverse(rev.begin(), rev.end()); return rev; } @@ -71,10 +74,10 @@ class ReverseKeyComparator : public Comparator { *key = Reverse(s); } }; -} // namespace -static ReverseKeyComparator reverse_key_comparator; -static void Increment(const Comparator* cmp, std::string* key) { +ReverseKeyComparator reverse_key_comparator; + +void Increment(const Comparator* cmp, std::string* key) { if (cmp == BytewiseComparator()) { key->push_back('\0'); } else { @@ -86,7 +89,6 @@ static void Increment(const Comparator* cmp, std::string* key) { } // An STL comparator that uses a Comparator -namespace anon { struct STLLessThan { const Comparator* cmp; @@ -96,6 +98,7 @@ struct STLLessThan { return cmp->Compare(Slice(a), Slice(b)) < 0; } }; + } // namespace class StringSink: public WritableFile { @@ -120,8 +123,9 @@ class StringSink: public WritableFile { 
class StringSource: public RandomAccessFile { public: - StringSource(const Slice& contents, uint64_t uniq_id) - : contents_(contents.data(), contents.size()), uniq_id_(uniq_id) { + StringSource(const Slice& contents, uint64_t uniq_id, bool mmap) + : contents_(contents.data(), contents.size()), uniq_id_(uniq_id), + mmap_(mmap) { } virtual ~StringSource() { } @@ -136,8 +140,12 @@ class StringSource: public RandomAccessFile { if (offset + n > contents_.size()) { n = contents_.size() - offset; } - memcpy(scratch, &contents_[offset], n); - *result = Slice(scratch, n); + if (!mmap_) { + memcpy(scratch, &contents_[offset], n); + *result = Slice(scratch, n); + } else { + *result = Slice(&contents_[offset], n); + } return Status::OK(); } @@ -155,15 +163,16 @@ class StringSource: public RandomAccessFile { private: std::string contents_; uint64_t uniq_id_; + bool mmap_; }; -typedef std::map KVMap; +typedef std::map KVMap; // Helper class for tests to unify the interface between // BlockBuilder/TableBuilder and Block/Table. class Constructor { public: - explicit Constructor(const Comparator* cmp) : data_(anon::STLLessThan(cmp)) { } + explicit Constructor(const Comparator* cmp) : data_(STLLessThan(cmp)) {} virtual ~Constructor() { } void Add(const std::string& key, const Slice& value) { @@ -174,8 +183,9 @@ class Constructor { // been added so far. 
Returns the keys in sorted order in "*keys" // and stores the key/value pairs in "*kvmap" void Finish(const Options& options, - std::vector* keys, - KVMap* kvmap) { + const InternalKeyComparator& internal_comparator, + std::vector* keys, KVMap* kvmap) { + last_internal_key_ = &internal_comparator; *kvmap = data_; keys->clear(); for (KVMap::const_iterator it = data_.begin(); @@ -184,12 +194,14 @@ class Constructor { keys->push_back(it->first); } data_.clear(); - Status s = FinishImpl(options, *kvmap); + Status s = FinishImpl(options, internal_comparator, *kvmap); ASSERT_TRUE(s.ok()) << s.ToString(); } // Construct the data structure from the data in "data" - virtual Status FinishImpl(const Options& options, const KVMap& data) = 0; + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) = 0; virtual Iterator* NewIterator() const = 0; @@ -197,6 +209,9 @@ class Constructor { virtual DB* db() const { return nullptr; } // Overridden in DBConstructor + protected: + const InternalKeyComparator* last_internal_key_; + private: KVMap data_; }; @@ -210,10 +225,12 @@ class BlockConstructor: public Constructor { ~BlockConstructor() { delete block_; } - virtual Status FinishImpl(const Options& options, const KVMap& data) { + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) { delete block_; block_ = nullptr; - BlockBuilder builder(options); + BlockBuilder builder(options, &internal_comparator); for (KVMap::const_iterator it = data.begin(); it != data.end(); @@ -241,49 +258,97 @@ class BlockConstructor: public Constructor { BlockConstructor(); }; -class BlockBasedTableConstructor: public Constructor { +// A helper class that converts internal format keys into user keys +class KeyConvertingIterator: public Iterator { public: - explicit BlockBasedTableConstructor(const Comparator* cmp) - : Constructor(cmp) {} - ~BlockBasedTableConstructor() { 
- Reset(); + explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { } + virtual ~KeyConvertingIterator() { delete iter_; } + virtual bool Valid() const { return iter_->Valid(); } + virtual void Seek(const Slice& target) { + ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + iter_->Seek(encoded); + } + virtual void SeekToFirst() { iter_->SeekToFirst(); } + virtual void SeekToLast() { iter_->SeekToLast(); } + virtual void Next() { iter_->Next(); } + virtual void Prev() { iter_->Prev(); } + + virtual Slice key() const { + assert(Valid()); + ParsedInternalKey key; + if (!ParseInternalKey(iter_->key(), &key)) { + status_ = Status::Corruption("malformed internal key"); + return Slice("corrupted key"); + } + return key.user_key; } - virtual Status FinishImpl(const Options& options, const KVMap& data) { + virtual Slice value() const { return iter_->value(); } + virtual Status status() const { + return status_.ok() ? 
iter_->status() : status_; + } + + private: + mutable Status status_; + Iterator* iter_; + + // No copying allowed + KeyConvertingIterator(const KeyConvertingIterator&); + void operator=(const KeyConvertingIterator&); +}; + +class TableConstructor: public Constructor { + public: + explicit TableConstructor(const Comparator* cmp, + bool convert_to_internal_key = false) + : Constructor(cmp), convert_to_internal_key_(convert_to_internal_key) {} + ~TableConstructor() { Reset(); } + + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) { Reset(); sink_.reset(new StringSink()); - std::unique_ptr flush_policy_factory( - new FlushBlockBySizePolicyFactory(options.block_size, - options.block_size_deviation)); - - BlockBasedTableBuilder builder( - options, - sink_.get(), - flush_policy_factory.get(), - options.compression); + unique_ptr builder; + builder.reset(options.table_factory->NewTableBuilder( + options, internal_comparator, sink_.get(), options.compression)); for (KVMap::const_iterator it = data.begin(); it != data.end(); ++it) { - builder.Add(it->first, it->second); - ASSERT_TRUE(builder.status().ok()); + if (convert_to_internal_key_) { + ParsedInternalKey ikey(it->first, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + builder->Add(encoded, it->second); + } else { + builder->Add(it->first, it->second); + } + ASSERT_TRUE(builder->status().ok()); } - Status s = builder.Finish(); + Status s = builder->Finish(); ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(sink_->contents().size(), builder.FileSize()); + ASSERT_EQ(sink_->contents().size(), builder->FileSize()); // Open the table uniq_id_ = cur_uniq_id_++; - source_.reset(new StringSource(sink_->contents(), uniq_id_)); - return options.table_factory->GetTableReader(options, soptions, - std::move(source_), - sink_->contents().size(), - &table_reader_); + source_.reset(new StringSource(sink_->contents(), 
uniq_id_, + options.allow_mmap_reads)); + return options.table_factory->NewTableReader( + options, soptions, internal_comparator, std::move(source_), + sink_->contents().size(), &table_reader_); } virtual Iterator* NewIterator() const { - return table_reader_->NewIterator(ReadOptions()); + Iterator* iter = table_reader_->NewIterator(ReadOptions()); + if (convert_to_internal_key_) { + return new KeyConvertingIterator(iter); + } else { + return iter; + } } uint64_t ApproximateOffsetOf(const Slice& key) const { @@ -291,11 +356,12 @@ class BlockBasedTableConstructor: public Constructor { } virtual Status Reopen(const Options& options) { - source_.reset(new StringSource(sink_->contents(), uniq_id_)); - return options.table_factory->GetTableReader(options, soptions, - std::move(source_), - sink_->contents().size(), - &table_reader_); + source_.reset( + new StringSource(sink_->contents(), uniq_id_, + options.allow_mmap_reads)); + return options.table_factory->NewTableReader( + options, soptions, *last_internal_key_, std::move(source_), + sink_->contents().size(), &table_reader_); } virtual TableReader* table_reader() { @@ -309,59 +375,19 @@ class BlockBasedTableConstructor: public Constructor { sink_.reset(); source_.reset(); } + bool convert_to_internal_key_; uint64_t uniq_id_; unique_ptr sink_; unique_ptr source_; unique_ptr table_reader_; - BlockBasedTableConstructor(); + TableConstructor(); static uint64_t cur_uniq_id_; const EnvOptions soptions; }; -uint64_t BlockBasedTableConstructor::cur_uniq_id_ = 1; - -// A helper class that converts internal format keys into user keys -class KeyConvertingIterator: public Iterator { - public: - explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { } - virtual ~KeyConvertingIterator() { delete iter_; } - virtual bool Valid() const { return iter_->Valid(); } - virtual void Seek(const Slice& target) { - ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); - std::string encoded; - AppendInternalKey(&encoded, 
ikey); - iter_->Seek(encoded); - } - virtual void SeekToFirst() { iter_->SeekToFirst(); } - virtual void SeekToLast() { iter_->SeekToLast(); } - virtual void Next() { iter_->Next(); } - virtual void Prev() { iter_->Prev(); } - - virtual Slice key() const { - assert(Valid()); - ParsedInternalKey key; - if (!ParseInternalKey(iter_->key(), &key)) { - status_ = Status::Corruption("malformed internal key"); - return Slice("corrupted key"); - } - return key.user_key; - } - - virtual Slice value() const { return iter_->value(); } - virtual Status status() const { - return status_.ok() ? iter_->status() : status_; - } - - private: - mutable Status status_; - Iterator* iter_; - - // No copying allowed - KeyConvertingIterator(const KeyConvertingIterator&); - void operator=(const KeyConvertingIterator&); -}; +uint64_t TableConstructor::cur_uniq_id_ = 1; class MemTableConstructor: public Constructor { public: @@ -378,7 +404,9 @@ class MemTableConstructor: public Constructor { ~MemTableConstructor() { delete memtable_->Unref(); } - virtual Status FinishImpl(const Options& options, const KVMap& data) { + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) { delete memtable_->Unref(); Options memtable_options; memtable_options.memtable_factory = table_factory_; @@ -414,7 +442,9 @@ class DBConstructor: public Constructor { ~DBConstructor() { delete db_; } - virtual Status FinishImpl(const Options& options, const KVMap& data) { + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) { delete db_; db_ = nullptr; NewDB(); @@ -480,7 +510,9 @@ static bool BZip2CompressionSupported() { #endif enum TestType { - TABLE_TEST, + BLOCK_BASED_TABLE_TEST, + PLAIN_TABLE_SEMI_FIXED_PREFIX, + PLAIN_TABLE_FULL_STR_PREFIX, BLOCK_TEST, MEMTABLE_TEST, DB_TEST @@ -493,49 +525,98 @@ struct TestArgs { CompressionType compression; }; - static std::vector 
GenerateArgList() { - std::vector ret; - TestType test_type[4] = {TABLE_TEST, BLOCK_TEST, MEMTABLE_TEST, DB_TEST}; - int test_type_len = 4; - bool reverse_compare[2] = {false, true}; - int reverse_compare_len = 2; - int restart_interval[3] = {16, 1, 1024}; - int restart_interval_len = 3; + std::vector test_args; + std::vector test_types = { + BLOCK_BASED_TABLE_TEST, PLAIN_TABLE_SEMI_FIXED_PREFIX, + PLAIN_TABLE_FULL_STR_PREFIX, BLOCK_TEST, + MEMTABLE_TEST, DB_TEST}; + std::vector reverse_compare_types = {false, true}; + std::vector restart_intervals = {16, 1, 1024}; // Only add compression if it is supported - std::vector compression_types; - compression_types.push_back(kNoCompression); + std::vector compression_types = {kNoCompression}; #ifdef SNAPPY - if (SnappyCompressionSupported()) + if (SnappyCompressionSupported()) { compression_types.push_back(kSnappyCompression); + } #endif #ifdef ZLIB - if (ZlibCompressionSupported()) + if (ZlibCompressionSupported()) { compression_types.push_back(kZlibCompression); + } #endif #ifdef BZIP2 - if (BZip2CompressionSupported()) + if (BZip2CompressionSupported()) { compression_types.push_back(kBZip2Compression); + } #endif - for(int i =0; i < test_type_len; i++) - for (int j =0; j < reverse_compare_len; j++) - for (int k =0; k < restart_interval_len; k++) - for (unsigned int n =0; n < compression_types.size(); n++) { - TestArgs one_arg; - one_arg.type = test_type[i]; - one_arg.reverse_compare = reverse_compare[j]; - one_arg.restart_interval = restart_interval[k]; - one_arg.compression = compression_types[n]; - ret.push_back(one_arg); - } + for (auto test_type : test_types) { + for (auto reverse_compare : reverse_compare_types) { + if (test_type == PLAIN_TABLE_SEMI_FIXED_PREFIX || + test_type == PLAIN_TABLE_FULL_STR_PREFIX) { + // Plain table doesn't use restart index or compression. 
+ TestArgs one_arg; + one_arg.type = test_type; + one_arg.reverse_compare = reverse_compare; + one_arg.restart_interval = restart_intervals[0]; + one_arg.compression = compression_types[0]; + test_args.push_back(one_arg); + continue; + } - return ret; + for (auto restart_interval : restart_intervals) { + for (auto compression_type : compression_types) { + TestArgs one_arg; + one_arg.type = test_type; + one_arg.reverse_compare = reverse_compare; + one_arg.restart_interval = restart_interval; + one_arg.compression = compression_type; + test_args.push_back(one_arg); + } + } + } + } + return test_args; } +// In order to make all tests run for plain table format, including +// those operating on empty keys, create a new prefix transformer which +// return fixed prefix if the slice is not shorter than the prefix length, +// and the full slice if it is shorter. +class FixedOrLessPrefixTransform : public SliceTransform { + private: + const size_t prefix_len_; + + public: + explicit FixedOrLessPrefixTransform(size_t prefix_len) : + prefix_len_(prefix_len) { + } + + virtual const char* Name() const { + return "rocksdb.FixedPrefix"; + } + + virtual Slice Transform(const Slice& src) const { + assert(InDomain(src)); + if (src.size() < prefix_len_) { + return src; + } + return Slice(src.data(), prefix_len_); + } + + virtual bool InDomain(const Slice& src) const { + return true; + } + + virtual bool InRange(const Slice& dst) const { + return (dst.size() <= prefix_len_); + } +}; + class Harness { public: Harness() : constructor_(nullptr) { } @@ -553,9 +634,40 @@ class Harness { if (args.reverse_compare) { options_.comparator = &reverse_key_comparator; } + + internal_comparator_.reset( + new test::PlainInternalKeyComparator(options_.comparator)); + + support_prev_ = true; + only_support_prefix_seek_ = false; + BlockBasedTableOptions table_options; switch (args.type) { - case TABLE_TEST: - constructor_ = new BlockBasedTableConstructor(options_.comparator); + case 
BLOCK_BASED_TABLE_TEST: + table_options.flush_block_policy_factory.reset( + new FlushBlockBySizePolicyFactory(options_.block_size, + options_.block_size_deviation)); + options_.table_factory.reset(new BlockBasedTableFactory(table_options)); + constructor_ = new TableConstructor(options_.comparator); + break; + case PLAIN_TABLE_SEMI_FIXED_PREFIX: + support_prev_ = false; + only_support_prefix_seek_ = true; + options_.prefix_extractor = prefix_transform.get(); + options_.allow_mmap_reads = true; + options_.table_factory.reset(new PlainTableFactory()); + constructor_ = new TableConstructor(options_.comparator, true); + internal_comparator_.reset( + new InternalKeyComparator(options_.comparator)); + break; + case PLAIN_TABLE_FULL_STR_PREFIX: + support_prev_ = false; + only_support_prefix_seek_ = true; + options_.prefix_extractor = noop_transform.get(); + options_.allow_mmap_reads = true; + options_.table_factory.reset(new PlainTableFactory()); + constructor_ = new TableConstructor(options_.comparator, true); + internal_comparator_.reset( + new InternalKeyComparator(options_.comparator)); break; case BLOCK_TEST: constructor_ = new BlockConstructor(options_.comparator); @@ -580,10 +692,12 @@ class Harness { void Test(Random* rnd) { std::vector keys; KVMap data; - constructor_->Finish(options_, &keys, &data); + constructor_->Finish(options_, *internal_comparator_, &keys, &data); TestForwardScan(keys, data); - TestBackwardScan(keys, data); + if (support_prev_) { + TestBackwardScan(keys, data); + } TestRandomAccess(rnd, keys, data); } @@ -626,7 +740,7 @@ class Harness { KVMap::const_iterator model_iter = data.begin(); if (kVerbose) fprintf(stderr, "---\n"); for (int i = 0; i < 200; i++) { - const int toss = rnd->Uniform(5); + const int toss = rnd->Uniform(support_prev_ ? 
5 : 3); switch (toss) { case 0: { if (iter->Valid()) { @@ -718,17 +832,20 @@ class Harness { } else { const int index = rnd->Uniform(keys.size()); std::string result = keys[index]; - switch (rnd->Uniform(3)) { + switch (rnd->Uniform(support_prev_ ? 3 : 1)) { case 0: // Return an existing key break; case 1: { // Attempt to return something smaller than an existing key - if (result.size() > 0 && result[result.size()-1] > '\0') { - result[result.size()-1]--; + if (result.size() > 0 && result[result.size() - 1] > '\0' + && (!only_support_prefix_seek_ + || options_.prefix_extractor->Transform(result).size() + < result.size())) { + result[result.size() - 1]--; } break; - } + } case 2: { // Return something larger than an existing key Increment(options_.comparator, &result); @@ -745,50 +862,17 @@ class Harness { private: Options options_ = Options(); Constructor* constructor_; + bool support_prev_; + bool only_support_prefix_seek_; + shared_ptr internal_comparator_; + static std::unique_ptr noop_transform; + static std::unique_ptr prefix_transform; }; -// Test the empty key -TEST(Harness, SimpleEmptyKey) { - std::vector args = GenerateArgList(); - for (unsigned int i = 0; i < args.size(); i++) { - Init(args[i]); - Random rnd(test::RandomSeed() + 1); - Add("", "v"); - Test(&rnd); - } -} - -TEST(Harness, SimpleSingle) { - std::vector args = GenerateArgList(); - for (unsigned int i = 0; i < args.size(); i++) { - Init(args[i]); - Random rnd(test::RandomSeed() + 2); - Add("abc", "v"); - Test(&rnd); - } -} - -TEST(Harness, SimpleMulti) { - std::vector args = GenerateArgList(); - for (unsigned int i = 0; i < args.size(); i++) { - Init(args[i]); - Random rnd(test::RandomSeed() + 3); - Add("abc", "v"); - Add("abcd", "v"); - Add("ac", "v2"); - Test(&rnd); - } -} - -TEST(Harness, SimpleSpecialKey) { - std::vector args = GenerateArgList(); - for (unsigned int i = 0; i < args.size(); i++) { - Init(args[i]); - Random rnd(test::RandomSeed() + 4); - Add("\xff\xff", "v3"); - Test(&rnd); - 
} -} +std::unique_ptr Harness::noop_transform( + NewNoopTransform()); +std::unique_ptr Harness::prefix_transform( + new FixedOrLessPrefixTransform(2)); static bool Between(uint64_t val, uint64_t low, uint64_t high) { bool result = (val >= low) && (val <= high); @@ -801,12 +885,30 @@ static bool Between(uint64_t val, uint64_t low, uint64_t high) { return result; } -class TableTest { }; +// Tests against all kinds of tables +class TableTest { + public: + const InternalKeyComparator& GetPlainInternalComparator( + const Comparator* comp) { + if (!plain_internal_comparator) { + plain_internal_comparator.reset( + new test::PlainInternalKeyComparator(comp)); + } + return *plain_internal_comparator; + } + + private: + std::unique_ptr plain_internal_comparator; +}; + +class GeneralTableTest : public TableTest {}; +class BlockBasedTableTest : public TableTest {}; +class PlainTableTest : public TableTest {}; // This test include all the basic checks except those for index size and block // size, which will be conducted in separated unit tests. -TEST(TableTest, BasicTableProperties) { - BlockBasedTableConstructor c(BytewiseComparator()); +TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) { + TableConstructor c(BytewiseComparator()); c.Add("a1", "val1"); c.Add("b2", "val2"); @@ -824,7 +926,8 @@ TEST(TableTest, BasicTableProperties) { options.compression = kNoCompression; options.block_restart_interval = 1; - c.Finish(options, &keys, &kvmap); + c.Finish(options, GetPlainInternalComparator(options.comparator), &keys, + &kvmap); auto& props = c.table_reader()->GetTableProperties(); ASSERT_EQ(kvmap.size(), props.num_entries); @@ -838,7 +941,7 @@ TEST(TableTest, BasicTableProperties) { ASSERT_EQ("", props.filter_policy_name); // no filter policy is used // Verify data size. 
- BlockBuilder block_builder(options); + BlockBuilder block_builder(options, options.comparator); for (const auto& item : kvmap) { block_builder.Add(item.first, item.second); } @@ -849,8 +952,8 @@ TEST(TableTest, BasicTableProperties) { ); } -TEST(TableTest, FilterPolicyNameProperties) { - BlockBasedTableConstructor c(BytewiseComparator()); +TEST(BlockBasedTableTest, FilterPolicyNameProperties) { + TableConstructor c(BytewiseComparator()); c.Add("a1", "val1"); std::vector keys; KVMap kvmap; @@ -860,7 +963,8 @@ TEST(TableTest, FilterPolicyNameProperties) { ); options.filter_policy = filter_policy.get(); - c.Finish(options, &keys, &kvmap); + c.Finish(options, GetPlainInternalComparator(options.comparator), &keys, + &kvmap); auto& props = c.table_reader()->GetTableProperties(); ASSERT_EQ("rocksdb.BuiltinBloomFilter", props.filter_policy_name); } @@ -874,7 +978,7 @@ static std::string RandomString(Random* rnd, int len) { // It's very hard to figure out the index block size of a block accurately. // To make sure we get the index size, we just make sure as key number // grows, the filter block size also grows. -TEST(TableTest, IndexSizeStat) { +TEST(BlockBasedTableTest, IndexSizeStat) { uint64_t last_index_size = 0; // we need to use random keys since the pure human readable texts @@ -890,7 +994,7 @@ TEST(TableTest, IndexSizeStat) { // Each time we load one more key to the table. the table index block // size is expected to be larger than last time's. 
for (size_t i = 1; i < keys.size(); ++i) { - BlockBasedTableConstructor c(BytewiseComparator()); + TableConstructor c(BytewiseComparator()); for (size_t j = 0; j < i; ++j) { c.Add(keys[j], "val"); } @@ -901,7 +1005,8 @@ TEST(TableTest, IndexSizeStat) { options.compression = kNoCompression; options.block_restart_interval = 1; - c.Finish(options, &ks, &kvmap); + c.Finish(options, GetPlainInternalComparator(options.comparator), &ks, + &kvmap); auto index_size = c.table_reader()->GetTableProperties().index_size; ASSERT_GT(index_size, last_index_size); @@ -909,9 +1014,9 @@ TEST(TableTest, IndexSizeStat) { } } -TEST(TableTest, NumBlockStat) { +TEST(BlockBasedTableTest, NumBlockStat) { Random rnd(test::RandomSeed()); - BlockBasedTableConstructor c(BytewiseComparator()); + TableConstructor c(BytewiseComparator()); Options options; options.compression = kNoCompression; options.block_restart_interval = 1; @@ -925,7 +1030,8 @@ TEST(TableTest, NumBlockStat) { std::vector ks; KVMap kvmap; - c.Finish(options, &ks, &kvmap); + c.Finish(options, GetPlainInternalComparator(options.comparator), &ks, + &kvmap); ASSERT_EQ( kvmap.size(), c.table_reader()->GetTableProperties().num_data_blocks @@ -972,7 +1078,7 @@ class BlockCacheProperties { long data_block_cache_hit = 0; }; -TEST(TableTest, BlockCacheTest) { +TEST(BlockBasedTableTest, BlockCacheTest) { // -- Table construction Options options; options.create_if_missing = true; @@ -986,9 +1092,10 @@ TEST(TableTest, BlockCacheTest) { std::vector keys; KVMap kvmap; - BlockBasedTableConstructor c(BytewiseComparator()); + TableConstructor c(BytewiseComparator()); c.Add("key", "value"); - c.Finish(options, &keys, &kvmap); + c.Finish(options, GetPlainInternalComparator(options.comparator), &keys, + &kvmap); // -- PART 1: Open with regular block cache. // Since block_cache is disabled, no cache activities will be involved. 
@@ -1106,8 +1213,83 @@ TEST(TableTest, BlockCacheTest) { } } -TEST(TableTest, ApproximateOffsetOfPlain) { - BlockBasedTableConstructor c(BytewiseComparator()); +TEST(BlockBasedTableTest, BlockCacheLeak) { + // Check that when we reopen a table we don't lose access to blocks already + // in the cache. This test checks whether the Table actually makes use of the + // unique ID from the file. + + Options opt; + unique_ptr ikc; + ikc.reset(new test::PlainInternalKeyComparator(opt.comparator)); + opt.block_size = 1024; + opt.compression = kNoCompression; + opt.block_cache = + NewLRUCache(16 * 1024 * 1024); // big enough so we don't ever + // lose cached values. + + TableConstructor c(BytewiseComparator()); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector keys; + KVMap kvmap; + c.Finish(opt, *ikc, &keys, &kvmap); + + unique_ptr iter(c.NewIterator()); + iter->SeekToFirst(); + while (iter->Valid()) { + iter->key(); + iter->value(); + iter->Next(); + } + ASSERT_OK(iter->status()); + + ASSERT_OK(c.Reopen(opt)); + auto table_reader = dynamic_cast(c.table_reader()); + for (const std::string& key : keys) { + ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key)); + } +} + +extern const uint64_t kPlainTableMagicNumber; +TEST(PlainTableTest, BasicPlainTableProperties) { + PlainTableFactory factory(8, 8, 0); + StringSink sink; + Options options; + InternalKeyComparator ikc(options.comparator); + std::unique_ptr builder( + factory.NewTableBuilder(options, ikc, &sink, kNoCompression)); + + for (char c = 'a'; c <= 'z'; ++c) { + std::string key(8, c); + key.append("\1 "); // PlainTable expects internal key structure + std::string value(28, c + 42); + builder->Add(key, value); + } + ASSERT_OK(builder->Finish()); + + StringSource source(sink.contents(), 72242, true); + + 
TableProperties props; + auto s = ReadTableProperties(&source, sink.contents().size(), + kPlainTableMagicNumber, Env::Default(), nullptr, + &props); + ASSERT_OK(s); + + ASSERT_EQ(0ul, props.index_size); + ASSERT_EQ(0ul, props.filter_size); + ASSERT_EQ(16ul * 26, props.raw_key_size); + ASSERT_EQ(28ul * 26, props.raw_value_size); + ASSERT_EQ(26ul, props.num_entries); + ASSERT_EQ(1ul, props.num_data_blocks); +} + +TEST(GeneralTableTest, ApproximateOffsetOfPlain) { + TableConstructor c(BytewiseComparator()); c.Add("k01", "hello"); c.Add("k02", "hello2"); c.Add("k03", std::string(10000, 'x')); @@ -1118,9 +1300,10 @@ TEST(TableTest, ApproximateOffsetOfPlain) { std::vector keys; KVMap kvmap; Options options; + test::PlainInternalKeyComparator internal_comparator(options.comparator); options.block_size = 1024; options.compression = kNoCompression; - c.Finish(options, &keys, &kvmap); + c.Finish(options, internal_comparator, &keys, &kvmap); ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); @@ -1136,9 +1319,9 @@ TEST(TableTest, ApproximateOffsetOfPlain) { } -static void Do_Compression_Test(CompressionType comp) { +static void DoCompressionTest(CompressionType comp) { Random rnd(301); - BlockBasedTableConstructor c(BytewiseComparator()); + TableConstructor c(BytewiseComparator()); std::string tmp; c.Add("k01", "hello"); c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); @@ -1147,19 +1330,20 @@ static void Do_Compression_Test(CompressionType comp) { std::vector keys; KVMap kvmap; Options options; + test::PlainInternalKeyComparator ikc(options.comparator); options.block_size = 1024; options.compression = comp; - c.Finish(options, &keys, &kvmap); + c.Finish(options, ikc, &keys, &kvmap); ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); 
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6100)); } -TEST(TableTest, ApproximateOffsetOfCompressed) { +TEST(GeneralTableTest, ApproximateOffsetOfCompressed) { CompressionType compression_state[2]; int valid = 0; if (!SnappyCompressionSupported()) { @@ -1178,49 +1362,11 @@ TEST(TableTest, ApproximateOffsetOfCompressed) { for(int i =0; i < valid; i++) { - Do_Compression_Test(compression_state[i]); + DoCompressionTest(compression_state[i]); } } -TEST(TableTest, BlockCacheLeak) { - // Check that when we reopen a table we don't lose access to blocks already - // in the cache. This test checks whether the Table actually makes use of the - // unique ID from the file. - - Options opt; - opt.block_size = 1024; - opt.compression = kNoCompression; - opt.block_cache = NewLRUCache(16*1024*1024); // big enough so we don't ever - // lose cached values. 
- - BlockBasedTableConstructor c(BytewiseComparator()); - c.Add("k01", "hello"); - c.Add("k02", "hello2"); - c.Add("k03", std::string(10000, 'x')); - c.Add("k04", std::string(200000, 'x')); - c.Add("k05", std::string(300000, 'x')); - c.Add("k06", "hello3"); - c.Add("k07", std::string(100000, 'x')); - std::vector keys; - KVMap kvmap; - c.Finish(opt, &keys, &kvmap); - - unique_ptr iter(c.NewIterator()); - iter->SeekToFirst(); - while (iter->Valid()) { - iter->key(); - iter->value(); - iter->Next(); - } - ASSERT_OK(iter->status()); - - ASSERT_OK(c.Reopen(opt)); - for (const std::string& key: keys) { - ASSERT_TRUE(c.table_reader()->TEST_KeyInCache(ReadOptions(), key)); - } -} - TEST(Harness, Randomized) { std::vector args = GenerateArgList(); for (unsigned int i = 0; i < args.size(); i++) { @@ -1297,6 +1443,49 @@ TEST(MemTableTest, Simple) { delete memtable->Unref(); } +// Test the empty key +TEST(Harness, SimpleEmptyKey) { + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); + Random rnd(test::RandomSeed() + 1); + Add("", "v"); + Test(&rnd); + } +} + +TEST(Harness, SimpleSingle) { + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); + Random rnd(test::RandomSeed() + 2); + Add("abc", "v"); + Test(&rnd); + } +} + +TEST(Harness, SimpleMulti) { + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); + Random rnd(test::RandomSeed() + 3); + Add("abc", "v"); + Add("abcd", "v"); + Add("ac", "v2"); + Test(&rnd); + } +} + +TEST(Harness, SimpleSpecialKey) { + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); + Random rnd(test::RandomSeed() + 4); + Add("\xff\xff", "v3"); + Test(&rnd); + } +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index ac2d8d3d9..65a58ad93 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -20,18 +20,17 @@ namespace rocksdb { namespace { typedef 
Iterator* (*BlockFunction)(void*, const ReadOptions&, - const EnvOptions& soptions, const Slice&, - bool for_compaction); + const EnvOptions& soptions, + const InternalKeyComparator& icomparator, + const Slice&, bool for_compaction); class TwoLevelIterator: public Iterator { public: - TwoLevelIterator( - Iterator* index_iter, - BlockFunction block_function, - void* arg, - const ReadOptions& options, - const EnvOptions& soptions, - bool for_compaction); + TwoLevelIterator(Iterator* index_iter, BlockFunction block_function, + void* arg, const ReadOptions& options, + const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + bool for_compaction); virtual ~TwoLevelIterator(); @@ -76,6 +75,7 @@ class TwoLevelIterator: public Iterator { void* arg_; const ReadOptions options_; const EnvOptions& soptions_; + const InternalKeyComparator& internal_comparator_; Status status_; IteratorWrapper index_iter_; IteratorWrapper data_iter_; // May be nullptr @@ -86,20 +86,17 @@ class TwoLevelIterator: public Iterator { }; TwoLevelIterator::TwoLevelIterator( - Iterator* index_iter, - BlockFunction block_function, - void* arg, - const ReadOptions& options, - const EnvOptions& soptions, - bool for_compaction) + Iterator* index_iter, BlockFunction block_function, void* arg, + const ReadOptions& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, bool for_compaction) : block_function_(block_function), arg_(arg), options_(options), soptions_(soptions), + internal_comparator_(internal_comparator), index_iter_(index_iter), data_iter_(nullptr), - for_compaction_(for_compaction) { -} + for_compaction_(for_compaction) {} TwoLevelIterator::~TwoLevelIterator() { } @@ -181,8 +178,9 @@ void TwoLevelIterator::InitDataBlock() { // data_iter_ is already constructed with this iterator, so // no need to change anything } else { - Iterator* iter = (*block_function_)(arg_, options_, soptions_, handle, - for_compaction_); + Iterator* iter = + 
(*block_function_)(arg_, options_, soptions_, internal_comparator_, + handle, for_compaction_); data_block_handle_.assign(handle.data(), handle.size()); SetDataIterator(iter); } @@ -191,15 +189,14 @@ void TwoLevelIterator::InitDataBlock() { } // namespace -Iterator* NewTwoLevelIterator( - Iterator* index_iter, - BlockFunction block_function, - void* arg, - const ReadOptions& options, - const EnvOptions& soptions, - bool for_compaction) { - return new TwoLevelIterator(index_iter, block_function, arg, - options, soptions, for_compaction); +Iterator* NewTwoLevelIterator(Iterator* index_iter, + BlockFunction block_function, void* arg, + const ReadOptions& options, + const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + bool for_compaction) { + return new TwoLevelIterator(index_iter, block_function, arg, options, + soptions, internal_comparator, for_compaction); } } // namespace rocksdb diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h index 85aed3f14..d313dcb18 100644 --- a/table/two_level_iterator.h +++ b/table/two_level_iterator.h @@ -14,6 +14,7 @@ namespace rocksdb { struct ReadOptions; +class InternalKeyComparator; // Return a new two level iterator. 
A two-level iterator contains an // index iterator whose values point to a sequence of blocks where @@ -27,14 +28,11 @@ struct ReadOptions; extern Iterator* NewTwoLevelIterator( Iterator* index_iter, Iterator* (*block_function)( - void* arg, - const ReadOptions& options, - const EnvOptions& soptions, - const Slice& index_value, - bool for_compaction), - void* arg, - const ReadOptions& options, - const EnvOptions& soptions, + void* arg, const ReadOptions& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + const Slice& index_value, bool for_compaction), + void* arg, const ReadOptions& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, bool for_compaction = false); } // namespace rocksdb diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc index 903889556..79b361841 100644 --- a/tools/sst_dump.cc +++ b/tools/sst_dump.cc @@ -15,6 +15,7 @@ #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/table.h" +#include "rocksdb/table_properties.h" #include "table/block.h" #include "table/block_builder.h" #include "table/format.h" @@ -38,22 +39,50 @@ class SstFileReader { bool has_to, const std::string& to_key); + Status ReadTableProperties(TableProperties* table_properties); uint64_t GetReadNumber() { return read_num_; } -private: + private: + Status NewTableReader(const std::string& file_path); + std::string file_name_; uint64_t read_num_; bool verify_checksum_; bool output_hex_; EnvOptions soptions_; + + Status init_result_; + unique_ptr table_reader_; + unique_ptr file_; + // table_options_ and internal_comparator_ will also be used in + // ReadSequential internally (specifically, seek-related operations) + Options table_options_; + InternalKeyComparator internal_comparator_; }; SstFileReader::SstFileReader(const std::string& file_path, bool verify_checksum, bool output_hex) - :file_name_(file_path), read_num_(0), verify_checksum_(verify_checksum), - output_hex_(output_hex) { - 
std::cout << "Process " << file_path << "\n"; + :file_name_(file_path), read_num_(0), verify_checksum_(verify_checksum), + output_hex_(output_hex), internal_comparator_(BytewiseComparator()) { + fprintf(stdout, "Process %s\n", file_path.c_str()); + + init_result_ = NewTableReader(file_name_); +} + +Status SstFileReader::NewTableReader(const std::string& file_path) { + Status s = table_options_.env->NewRandomAccessFile(file_path, &file_, + soptions_); + if (!s.ok()) { + return s; + } + uint64_t file_size; + table_options_.env->GetFileSize(file_path, &file_size); + unique_ptr table_factory; + s = table_options_.table_factory->NewTableReader( + table_options_, soptions_, internal_comparator_, std::move(file_), + file_size, &table_reader_); + return s; } Status SstFileReader::ReadSequential(bool print_kv, @@ -61,29 +90,12 @@ Status SstFileReader::ReadSequential(bool print_kv, bool has_from, const std::string& from_key, bool has_to, - const std::string& to_key) -{ - unique_ptr table_reader; - InternalKeyComparator internal_comparator_(BytewiseComparator()); - Options table_options; - table_options.comparator = &internal_comparator_; - unique_ptr file; - Status s = table_options.env->NewRandomAccessFile(file_name_, &file, - soptions_); - if(!s.ok()) { - return s; - } - uint64_t file_size; - table_options.env->GetFileSize(file_name_, &file_size); - unique_ptr table_factory; - s = table_options.table_factory->GetTableReader(table_options, soptions_, - std::move(file), file_size, - &table_reader); - if(!s.ok()) { - return s; + const std::string& to_key) { + if (!table_reader_) { + return init_result_; } - Iterator* iter = table_reader->NewIterator(ReadOptions(verify_checksum_, + Iterator* iter = table_reader_->NewIterator(ReadOptions(verify_checksum_, false)); uint64_t i = 0; if (has_from) { @@ -113,21 +125,29 @@ Status SstFileReader::ReadSequential(bool print_kv, } if (print_kv) { - std::cout << ikey.DebugString(output_hex_) - << " => " - << value.ToString(output_hex_) << 
"\n"; + fprintf(stdout, "%s => %s\n", + ikey.DebugString(output_hex_).c_str(), + value.ToString(output_hex_).c_str()); } + } - } + read_num_ += i; + + Status ret = iter->status(); + delete iter; + return ret; +} - read_num_ += i; +Status SstFileReader::ReadTableProperties(TableProperties* table_properties) { + if (!table_reader_) { + return init_result_; + } - Status ret = iter->status(); - delete iter; - return ret; + *table_properties = table_reader_->GetTableProperties(); + return init_result_; } -} // namespace rocksdb +} // namespace rocksdb static void print_help() { fprintf(stderr, @@ -137,7 +157,8 @@ static void print_help() { " [--input_key_hex]" " [--from=]" " [--to=]" - " [--read_num=NUM]\n"); + " [--read_num=NUM]" + " [--show_properties]\n"); } string HexToString(const string& str) { @@ -158,7 +179,6 @@ string HexToString(const string& str) { } int main(int argc, char** argv) { - const char* dir_or_file = nullptr; uint64_t read_num = -1; std::string command; @@ -170,10 +190,10 @@ int main(int argc, char** argv) { bool input_key_hex = false; bool has_from = false; bool has_to = false; + bool show_properties = false; std::string from_key; std::string to_key; - for (int i = 1; i < argc; i++) - { + for (int i = 1; i < argc; i++) { if (strncmp(argv[i], "--file=", 7) == 0) { dir_or_file = argv[i] + 7; } else if (strcmp(argv[i], "--output_hex") == 0) { @@ -194,7 +214,9 @@ int main(int argc, char** argv) { } else if (strncmp(argv[i], "--to=", 5) == 0) { to_key = argv[i] + 5; has_to = true; - }else { + } else if (strcmp(argv[i], "--show_properties") == 0) { + show_properties = true; + } else { print_help(); exit(1); } @@ -210,7 +232,7 @@ int main(int argc, char** argv) { } } - if(dir_or_file == nullptr) { + if (dir_or_file == nullptr) { print_help(); exit(1); } @@ -225,18 +247,19 @@ int main(int argc, char** argv) { dir = false; } - std::cout << "from [" << rocksdb::Slice(from_key).ToString(true) - << "] to [" << rocksdb::Slice(to_key).ToString(true) << "]\n"; + 
fprintf(stdout, "from [%s] to [%s]\n", + rocksdb::Slice(from_key).ToString(true).c_str(), + rocksdb::Slice(to_key).ToString(true).c_str()); uint64_t total_read = 0; for (size_t i = 0; i < filenames.size(); i++) { std::string filename = filenames.at(i); if (filename.length() <= 4 || filename.rfind(".sst") != filename.length() - 4) { - //ignore + // ignore continue; } - if(dir) { + if (dir) { filename = std::string(dir_or_file) + "/" + filename; } rocksdb::SstFileReader reader(filename, verify_checksum, @@ -257,5 +280,20 @@ int main(int argc, char** argv) { break; } } + if (show_properties) { + rocksdb::TableProperties table_properties; + st = reader.ReadTableProperties(&table_properties); + if (!st.ok()) { + fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str()); + } else { + fprintf(stdout, + "Table Properties:\n" + "------------------------------\n" + " %s", table_properties.ToString("\n ", ": ").c_str()); + fprintf(stdout, "# deleted keys: %zd\n", + rocksdb::GetDeletedKeys( + table_properties.user_collected_properties)); + } + } } } diff --git a/util/arena_impl.cc b/util/arena.cc similarity index 82% rename from util/arena_impl.cc rename to util/arena.cc index 5125e2364..dffc8b88e 100644 --- a/util/arena_impl.cc +++ b/util/arena.cc @@ -7,19 +7,19 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "util/arena_impl.h" +#include "util/arena.h" #include namespace rocksdb { -const size_t ArenaImpl::kMinBlockSize = 4096; -const size_t ArenaImpl::kMaxBlockSize = 2 << 30; +const size_t Arena::kMinBlockSize = 4096; +const size_t Arena::kMaxBlockSize = 2 << 30; static const int kAlignUnit = sizeof(void*); size_t OptimizeBlockSize(size_t block_size) { // Make sure block_size is in optimal range - block_size = std::max(ArenaImpl::kMinBlockSize, block_size); - block_size = std::min(ArenaImpl::kMaxBlockSize, block_size); + block_size = std::max(Arena::kMinBlockSize, block_size); + block_size = std::min(Arena::kMaxBlockSize, block_size); // make sure block_size is the multiple of kAlignUnit if (block_size % kAlignUnit != 0) { @@ -29,19 +29,18 @@ size_t OptimizeBlockSize(size_t block_size) { return block_size; } -ArenaImpl::ArenaImpl(size_t block_size) - : kBlockSize(OptimizeBlockSize(block_size)) { +Arena::Arena(size_t block_size) : kBlockSize(OptimizeBlockSize(block_size)) { assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize && kBlockSize % kAlignUnit == 0); } -ArenaImpl::~ArenaImpl() { +Arena::~Arena() { for (const auto& block : blocks_) { delete[] block; } } -char* ArenaImpl::AllocateFallback(size_t bytes, bool aligned) { +char* Arena::AllocateFallback(size_t bytes, bool aligned) { if (bytes > kBlockSize / 4) { // Object is more than a quarter of our block size. Allocate it separately // to avoid wasting too much space in leftover bytes. 
@@ -63,7 +62,7 @@ char* ArenaImpl::AllocateFallback(size_t bytes, bool aligned) { } } -char* ArenaImpl::AllocateAligned(size_t bytes) { +char* Arena::AllocateAligned(size_t bytes) { assert((kAlignUnit & (kAlignUnit - 1)) == 0); // Pointer size should be a power of 2 size_t current_mod = @@ -83,7 +82,7 @@ char* ArenaImpl::AllocateAligned(size_t bytes) { return result; } -char* ArenaImpl::AllocateNewBlock(size_t block_bytes) { +char* Arena::AllocateNewBlock(size_t block_bytes) { char* block = new char[block_bytes]; blocks_memory_ += block_bytes; blocks_.push_back(block); diff --git a/util/arena_impl.h b/util/arena.h similarity index 81% rename from util/arena_impl.h rename to util/arena.h index 538385ccc..4c45417f4 100644 --- a/util/arena_impl.h +++ b/util/arena.h @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -// ArenaImpl is an implementation of Arena class. For a request of small size, +// Arena is an implementation of Arena class. For a request of small size, // it allocates a block with pre-defined block size. For a request of big // size, it uses malloc to directly get the requested size. 
@@ -16,37 +16,35 @@ #include #include #include -#include "rocksdb/arena.h" +#include "util/arena.h" namespace rocksdb { -class ArenaImpl : public Arena { +class Arena { public: // No copying allowed - ArenaImpl(const ArenaImpl&) = delete; - void operator=(const ArenaImpl&) = delete; + Arena(const Arena&) = delete; + void operator=(const Arena&) = delete; static const size_t kMinBlockSize; static const size_t kMaxBlockSize; - explicit ArenaImpl(size_t block_size = kMinBlockSize); - virtual ~ArenaImpl(); + explicit Arena(size_t block_size = kMinBlockSize); + ~Arena(); - virtual char* Allocate(size_t bytes) override; + char* Allocate(size_t bytes); - virtual char* AllocateAligned(size_t bytes) override; + char* AllocateAligned(size_t bytes); // Returns an estimate of the total memory usage of data allocated // by the arena (exclude the space allocated but not yet used for future // allocations). - virtual const size_t ApproximateMemoryUsage() { + const size_t ApproximateMemoryUsage() { return blocks_memory_ + blocks_.capacity() * sizeof(char*) - alloc_bytes_remaining_; } - virtual const size_t MemoryAllocatedBytes() override { - return blocks_memory_; - } + const size_t MemoryAllocatedBytes() { return blocks_memory_; } private: // Number of bytes allocated in one block @@ -72,7 +70,7 @@ class ArenaImpl : public Arena { size_t blocks_memory_ = 0; }; -inline char* ArenaImpl::Allocate(size_t bytes) { +inline char* Arena::Allocate(size_t bytes) { // The semantics of what to return are a bit messy if we allow // 0-byte allocations, so we disallow them here (we don't need // them for our internal use). diff --git a/util/arena_test.cc b/util/arena_test.cc index ca6dfc99d..1b2b53175 100644 --- a/util/arena_test.cc +++ b/util/arena_test.cc @@ -7,34 +7,32 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "util/arena_impl.h" +#include "util/arena.h" #include "util/random.h" #include "util/testharness.h" namespace rocksdb { -class ArenaImplTest { }; +class ArenaTest {}; -TEST(ArenaImplTest, Empty) { - ArenaImpl arena0; -} +TEST(ArenaTest, Empty) { Arena arena0; } -TEST(ArenaImplTest, MemoryAllocatedBytes) { +TEST(ArenaTest, MemoryAllocatedBytes) { const int N = 17; - size_t req_sz; //requested size + size_t req_sz; // requested size size_t bsz = 8192; // block size size_t expected_memory_allocated; - ArenaImpl arena_impl(bsz); + Arena arena(bsz); // requested size > quarter of a block: // allocate requested size separately req_sz = 3001; for (int i = 0; i < N; i++) { - arena_impl.Allocate(req_sz); + arena.Allocate(req_sz); } expected_memory_allocated = req_sz * N; - ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated); + ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); // requested size < quarter of a block: // allocate a block with the default size, then try to use unused part @@ -42,28 +40,28 @@ TEST(ArenaImplTest, MemoryAllocatedBytes) { // Allocate(99) call. All the remaining calls won't lead to new allocation. 
req_sz = 99; for (int i = 0; i < N; i++) { - arena_impl.Allocate(req_sz); + arena.Allocate(req_sz); } expected_memory_allocated += bsz; - ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated); + ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); // requested size > quarter of a block: // allocate requested size separately req_sz = 99999999; for (int i = 0; i < N; i++) { - arena_impl.Allocate(req_sz); + arena.Allocate(req_sz); } expected_memory_allocated += req_sz * N; - ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated); + ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); } // Make sure we didn't count the allocate but not used memory space in // Arena::ApproximateMemoryUsage() -TEST(ArenaImplTest, ApproximateMemoryUsageTest) { +TEST(ArenaTest, ApproximateMemoryUsageTest) { const size_t kBlockSize = 4096; const size_t kEntrySize = kBlockSize / 8; - const size_t kZero = 0; - ArenaImpl arena(kBlockSize); + const size_t kZero = 0; + Arena arena(kBlockSize); ASSERT_EQ(kZero, arena.ApproximateMemoryUsage()); auto num_blocks = kBlockSize / kEntrySize; @@ -83,9 +81,9 @@ TEST(ArenaImplTest, ApproximateMemoryUsageTest) { ASSERT_GT(usage, mem_usage); } -TEST(ArenaImplTest, Simple) { +TEST(ArenaTest, Simple) { std::vector> allocated; - ArenaImpl arena_impl; + Arena arena; const int N = 100000; size_t bytes = 0; Random rnd(301); @@ -104,9 +102,9 @@ TEST(ArenaImplTest, Simple) { } char* r; if (rnd.OneIn(10)) { - r = arena_impl.AllocateAligned(s); + r = arena.AllocateAligned(s); } else { - r = arena_impl.Allocate(s); + r = arena.Allocate(s); } for (unsigned int b = 0; b < s; b++) { @@ -115,9 +113,9 @@ TEST(ArenaImplTest, Simple) { } bytes += s; allocated.push_back(std::make_pair(s, r)); - ASSERT_GE(arena_impl.ApproximateMemoryUsage(), bytes); + ASSERT_GE(arena.ApproximateMemoryUsage(), bytes); if (i > N / 10) { - ASSERT_LE(arena_impl.ApproximateMemoryUsage(), bytes * 1.10); + 
ASSERT_LE(arena.ApproximateMemoryUsage(), bytes * 1.10); } } for (unsigned int i = 0; i < allocated.size(); i++) { @@ -132,6 +130,4 @@ TEST(ArenaImplTest, Simple) { } // namespace rocksdb -int main(int argc, char** argv) { - return rocksdb::test::RunAllTests(); -} +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/util/autovector.h b/util/autovector.h index 9998e2956..812a61795 100644 --- a/util/autovector.h +++ b/util/autovector.h @@ -57,11 +57,9 @@ class autovector { typedef std::random_access_iterator_tag iterator_category; iterator_impl(TAutoVector* vect, size_t index) - : vect_(vect) - , index_(index) { - }; + : vect_(vect), index_(index) {}; iterator_impl(const iterator_impl&) = default; - ~iterator_impl() { } + ~iterator_impl() {} iterator_impl& operator=(const iterator_impl&) = default; // -- Advancement @@ -130,9 +128,7 @@ class autovector { return index_ == other.index_; } - bool operator!=(const self_type& other) const { - return !(*this == other); - } + bool operator!=(const self_type& other) const { return !(*this == other); } bool operator>(const self_type& other) const { assert(vect_ == other.vect_); @@ -174,13 +170,9 @@ class autovector { return vect_.capacity() == 0; } - size_type size() const { - return num_stack_items_ + vect_.size(); - } + size_type size() const { return num_stack_items_ + vect_.size(); } - bool empty() const { - return size() == 0; - } + bool empty() const { return size() == 0; } // will not check boundry const_reference operator[](size_type n) const { @@ -235,11 +227,9 @@ class autovector { } } - void push_back(const T& item) { - push_back(value_type(item)); - } + void push_back(const T& item) { push_back(value_type(item)); } - template + template void emplace_back(Args&&... 
args) { push_back(value_type(args...)); } @@ -261,13 +251,9 @@ class autovector { // -- Copy and Assignment autovector& assign(const autovector& other); - autovector(const autovector& other) { - assign(other); - } + autovector(const autovector& other) { assign(other); } - autovector& operator=(const autovector& other) { - return assign(other); - } + autovector& operator=(const autovector& other) { return assign(other); } // move operation are disallowed since it is very hard to make sure both // autovectors are allocated from the same function stack. @@ -275,41 +261,29 @@ class autovector { autovector(autovector&& other) = delete; // -- Iterator Operations - iterator begin() { - return iterator(this, 0); - } + iterator begin() { return iterator(this, 0); } - const_iterator begin() const { - return const_iterator(this, 0); - } + const_iterator begin() const { return const_iterator(this, 0); } - iterator end() { - return iterator(this, this->size()); - } + iterator end() { return iterator(this, this->size()); } - const_iterator end() const { - return const_iterator(this, this->size()); - } + const_iterator end() const { return const_iterator(this, this->size()); } - reverse_iterator rbegin() { - return reverse_iterator(end()); - } + reverse_iterator rbegin() { return reverse_iterator(end()); } const_reverse_iterator rbegin() const { return const_reverse_iterator(end()); } - reverse_iterator rend() { - return reverse_iterator(begin()); - } + reverse_iterator rend() { return reverse_iterator(begin()); } const_reverse_iterator rend() const { return const_reverse_iterator(begin()); } private: - size_type num_stack_items_ = 0; // current number of items - value_type values_[kSize]; // the first `kSize` items + size_type num_stack_items_ = 0; // current number of items + value_type values_[kSize]; // the first `kSize` items // used only if there are more than `kSize` items. 
std::vector vect_; }; diff --git a/util/bloom_test.cc b/util/bloom_test.cc index 9dbd5d2cc..2c430e203 100644 --- a/util/bloom_test.cc +++ b/util/bloom_test.cc @@ -7,12 +7,16 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include + #include "rocksdb/filter_policy.h" #include "util/logging.h" #include "util/testharness.h" #include "util/testutil.h" +DEFINE_int32(bits_per_key, 10, ""); + namespace rocksdb { static const int kVerbose = 1; @@ -29,7 +33,7 @@ class BloomTest { std::vector keys_; public: - BloomTest() : policy_(NewBloomFilterPolicy(10)) { } + BloomTest() : policy_(NewBloomFilterPolicy(FLAGS_bits_per_key)) { } ~BloomTest() { delete policy_; @@ -160,5 +164,7 @@ TEST(BloomTest, VaryingLengths) { } // namespace rocksdb int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + return rocksdb::test::RunAllTests(); } diff --git a/util/cache.cc b/util/cache.cc index 4707eac94..8f7deaaa8 100644 --- a/util/cache.cc +++ b/util/cache.cc @@ -10,10 +10,10 @@ #include #include #include -#include #include "rocksdb/cache.h" #include "port/port.h" +#include "util/autovector.h" #include "util/hash.h" #include "util/mutexlock.h" @@ -156,6 +156,13 @@ class LRUCache { Cache::Handle* Lookup(const Slice& key, uint32_t hash); void Release(Cache::Handle* handle); void Erase(const Slice& key, uint32_t hash); + // Although in some platforms the update of size_t is atomic, to make sure + // GetUsage() works correctly under any platforms, we'll protect this + // function with mutex. + size_t GetUsage() const { + MutexLock l(&mutex_); + return usage_; + } private: void LRU_Remove(LRUHandle* e); @@ -171,7 +178,9 @@ class LRUCache { uint32_t remove_scan_count_limit_; // mutex_ protects the following state. 
- port::Mutex mutex_; + // We don't count mutex_ as the cache's internal state so semantically we + // don't mind mutex_ invoking the non-const actions. + mutable port::Mutex mutex_; size_t usage_; // Dummy head of LRU list. @@ -255,8 +264,7 @@ Cache::Handle* LRUCache::Insert( LRUHandle* e = reinterpret_cast( malloc(sizeof(LRUHandle)-1 + key.size())); - std::vector last_reference_list; - last_reference_list.reserve(1); + autovector last_reference_list; e->value = value; e->deleter = deleter; @@ -342,10 +350,10 @@ static int kRemoveScanCountLimit = 0; // default values, can be overridden class ShardedLRUCache : public Cache { private: - LRUCache* shard_; + LRUCache* shards_; port::Mutex id_mutex_; uint64_t last_id_; - int numShardBits; + int num_shard_bits_; size_t capacity_; static inline uint32_t HashSlice(const Slice& s) { @@ -354,18 +362,18 @@ class ShardedLRUCache : public Cache { uint32_t Shard(uint32_t hash) { // Note, hash >> 32 yields hash in gcc, not the zero we expect! - return (numShardBits > 0) ? (hash >> (32 - numShardBits)) : 0; + return (num_shard_bits_ > 0) ? 
(hash >> (32 - num_shard_bits_)) : 0; } void init(size_t capacity, int numbits, int removeScanCountLimit) { - numShardBits = numbits; + num_shard_bits_ = numbits; capacity_ = capacity; - int numShards = 1 << numShardBits; - shard_ = new LRUCache[numShards]; - const size_t per_shard = (capacity + (numShards - 1)) / numShards; - for (int s = 0; s < numShards; s++) { - shard_[s].SetCapacity(per_shard); - shard_[s].SetRemoveScanCountLimit(removeScanCountLimit); + int num_shards = 1 << num_shard_bits_; + shards_ = new LRUCache[num_shards]; + const size_t per_shard = (capacity + (num_shards - 1)) / num_shards; + for (int s = 0; s < num_shards; s++) { + shards_[s].SetCapacity(per_shard); + shards_[s].SetRemoveScanCountLimit(removeScanCountLimit); } } @@ -374,30 +382,30 @@ class ShardedLRUCache : public Cache { : last_id_(0) { init(capacity, kNumShardBits, kRemoveScanCountLimit); } - ShardedLRUCache(size_t capacity, int numShardBits, + ShardedLRUCache(size_t capacity, int num_shard_bits, int removeScanCountLimit) : last_id_(0) { - init(capacity, numShardBits, removeScanCountLimit); + init(capacity, num_shard_bits, removeScanCountLimit); } virtual ~ShardedLRUCache() { - delete[] shard_; + delete[] shards_; } virtual Handle* Insert(const Slice& key, void* value, size_t charge, void (*deleter)(const Slice& key, void* value)) { const uint32_t hash = HashSlice(key); - return shard_[Shard(hash)].Insert(key, hash, value, charge, deleter); + return shards_[Shard(hash)].Insert(key, hash, value, charge, deleter); } virtual Handle* Lookup(const Slice& key) { const uint32_t hash = HashSlice(key); - return shard_[Shard(hash)].Lookup(key, hash); + return shards_[Shard(hash)].Lookup(key, hash); } virtual void Release(Handle* handle) { LRUHandle* h = reinterpret_cast(handle); - shard_[Shard(h->hash)].Release(handle); + shards_[Shard(h->hash)].Release(handle); } virtual void Erase(const Slice& key) { const uint32_t hash = HashSlice(key); - shard_[Shard(hash)].Erase(key, hash); + 
shards_[Shard(hash)].Erase(key, hash); } virtual void* Value(Handle* handle) { return reinterpret_cast(handle)->value; @@ -406,11 +414,23 @@ class ShardedLRUCache : public Cache { MutexLock l(&id_mutex_); return ++(last_id_); } - virtual size_t GetCapacity() { + virtual size_t GetCapacity() const { return capacity_; } + + virtual size_t GetUsage() const { + // We will not lock the cache when getting the usage from shards. + // for (size_t i = 0; i < num_shard_bits_; ++i) + int num_shards = 1 << num_shard_bits_; + size_t usage = 0; + for (int s = 0; s < num_shards; s++) { + usage += shards_[s].GetUsage(); + } + return usage; + } + virtual void DisownData() { - shard_ = nullptr; + shards_ = nullptr; } }; @@ -420,17 +440,17 @@ shared_ptr NewLRUCache(size_t capacity) { return NewLRUCache(capacity, kNumShardBits); } -shared_ptr NewLRUCache(size_t capacity, int numShardBits) { - return NewLRUCache(capacity, numShardBits, kRemoveScanCountLimit); +shared_ptr NewLRUCache(size_t capacity, int num_shard_bits) { + return NewLRUCache(capacity, num_shard_bits, kRemoveScanCountLimit); } -shared_ptr NewLRUCache(size_t capacity, int numShardBits, +shared_ptr NewLRUCache(size_t capacity, int num_shard_bits, int removeScanCountLimit) { - if (numShardBits >= 20) { + if (num_shard_bits >= 20) { return nullptr; // the cache cannot be sharded into too many fine pieces } return std::make_shared(capacity, - numShardBits, + num_shard_bits, removeScanCountLimit); } diff --git a/util/cache_test.cc b/util/cache_test.cc index 87ab91389..b99f47b38 100644 --- a/util/cache_test.cc +++ b/util/cache_test.cc @@ -107,6 +107,39 @@ class CacheTest { }; CacheTest* CacheTest::current_; +void dumbDeleter(const Slice& key, void* value) { } + +TEST(CacheTest, UsageTest) { + // cache is shared_ptr and will be automatically cleaned up. 
+ const uint64_t kCapacity = 100000; + auto cache = NewLRUCache(kCapacity, 8, 200); + + size_t usage = 0; + const char* value = "abcdef"; + // make sure everything will be cached + for (int i = 1; i < 100; ++i) { + std::string key(i, 'a'); + auto kv_size = key.size() + 5; + cache->Release( + cache->Insert(key, (void*)value, kv_size, dumbDeleter) + ); + usage += kv_size; + ASSERT_EQ(usage, cache->GetUsage()); + } + + // make sure the cache will be overloaded + for (uint64_t i = 1; i < kCapacity; ++i) { + auto key = std::to_string(i); + cache->Release( + cache->Insert(key, (void*)value, key.size() + 5, dumbDeleter) + ); + } + + // the usage should be close to the capacity + ASSERT_GT(kCapacity, cache->GetUsage()); + ASSERT_LT(kCapacity * 0.95, cache->GetUsage()); +} + TEST(CacheTest, HitAndMiss) { ASSERT_EQ(-1, Lookup(100)); @@ -353,7 +386,6 @@ void deleter(const Slice& key, void* value) { delete (Value *)value; } - TEST(CacheTest, BadEviction) { int n = 10; diff --git a/util/coding.cc b/util/coding.cc index 6cf67efad..31ae0e356 100644 --- a/util/coding.cc +++ b/util/coding.cc @@ -9,131 +9,41 @@ #include "util/coding.h" +#include #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" - -#include - namespace rocksdb { -void EncodeFixed32(char* buf, uint32_t value) { -#if __BYTE_ORDER == __LITTLE_ENDIAN - memcpy(buf, &value, sizeof(value)); -#else - buf[0] = value & 0xff; - buf[1] = (value >> 8) & 0xff; - buf[2] = (value >> 16) & 0xff; - buf[3] = (value >> 24) & 0xff; -#endif -} - -void EncodeFixed64(char* buf, uint64_t value) { -#if __BYTE_ORDER == __LITTLE_ENDIAN - memcpy(buf, &value, sizeof(value)); -#else - buf[0] = value & 0xff; - buf[1] = (value >> 8) & 0xff; - buf[2] = (value >> 16) & 0xff; - buf[3] = (value >> 24) & 0xff; - buf[4] = (value >> 32) & 0xff; - buf[5] = (value >> 40) & 0xff; - buf[6] = (value >> 48) & 0xff; - buf[7] = (value >> 56) & 0xff; -#endif -} - -void PutFixed32(std::string* dst, uint32_t value) { - char buf[sizeof(value)]; - 
EncodeFixed32(buf, value); - dst->append(buf, sizeof(buf)); -} - -void PutFixed64(std::string* dst, uint64_t value) { - char buf[sizeof(value)]; - EncodeFixed64(buf, value); - dst->append(buf, sizeof(buf)); -} - char* EncodeVarint32(char* dst, uint32_t v) { // Operate on characters as unsigneds unsigned char* ptr = reinterpret_cast(dst); static const int B = 128; - if (v < (1<<7)) { + if (v < (1 << 7)) { *(ptr++) = v; - } else if (v < (1<<14)) { + } else if (v < (1 << 14)) { *(ptr++) = v | B; - *(ptr++) = v>>7; - } else if (v < (1<<21)) { + *(ptr++) = v >> 7; + } else if (v < (1 << 21)) { *(ptr++) = v | B; - *(ptr++) = (v>>7) | B; - *(ptr++) = v>>14; - } else if (v < (1<<28)) { + *(ptr++) = (v >> 7) | B; + *(ptr++) = v >> 14; + } else if (v < (1 << 28)) { *(ptr++) = v | B; - *(ptr++) = (v>>7) | B; - *(ptr++) = (v>>14) | B; - *(ptr++) = v>>21; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = v >> 21; } else { *(ptr++) = v | B; - *(ptr++) = (v>>7) | B; - *(ptr++) = (v>>14) | B; - *(ptr++) = (v>>21) | B; - *(ptr++) = v>>28; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = (v >> 21) | B; + *(ptr++) = v >> 28; } return reinterpret_cast(ptr); } -void PutVarint32(std::string* dst, uint32_t v) { - char buf[5]; - char* ptr = EncodeVarint32(buf, v); - dst->append(buf, ptr - buf); -} - -char* EncodeVarint64(char* dst, uint64_t v) { - static const unsigned int B = 128; - unsigned char* ptr = reinterpret_cast(dst); - while (v >= B) { - *(ptr++) = (v & (B-1)) | B; - v >>= 7; - } - *(ptr++) = static_cast(v); - return reinterpret_cast(ptr); -} - -void PutVarint64(std::string* dst, uint64_t v) { - char buf[10]; - char* ptr = EncodeVarint64(buf, v); - dst->append(buf, ptr - buf); -} - -void PutLengthPrefixedSlice(std::string* dst, const Slice& value) { - PutVarint32(dst, value.size()); - dst->append(value.data(), value.size()); -} - -void PutLengthPrefixedSliceParts(std::string* dst, - const SliceParts& slice_parts) { - uint32_t total_bytes = 0; 
- for (int i = 0; i < slice_parts.num_parts; ++i) { - total_bytes += slice_parts.parts[i].size(); - } - PutVarint32(dst, total_bytes); - for (int i = 0; i < slice_parts.num_parts; ++i) { - dst->append(slice_parts.parts[i].data(), slice_parts.parts[i].size()); - } -} - -int VarintLength(uint64_t v) { - int len = 1; - while (v >= 128) { - v >>= 7; - len++; - } - return len; -} - -const char* GetVarint32PtrFallback(const char* p, - const char* limit, +const char* GetVarint32PtrFallback(const char* p, const char* limit, uint32_t* value) { uint32_t result = 0; for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) { @@ -151,18 +61,6 @@ const char* GetVarint32PtrFallback(const char* p, return nullptr; } -bool GetVarint32(Slice* input, uint32_t* value) { - const char* p = input->data(); - const char* limit = p + input->size(); - const char* q = GetVarint32Ptr(p, limit, value); - if (q == nullptr) { - return false; - } else { - *input = Slice(q, limit - q); - return true; - } -} - const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) { uint64_t result = 0; for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) { @@ -180,58 +78,6 @@ const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) { return nullptr; } -bool GetVarint64(Slice* input, uint64_t* value) { - const char* p = input->data(); - const char* limit = p + input->size(); - const char* q = GetVarint64Ptr(p, limit, value); - if (q == nullptr) { - return false; - } else { - *input = Slice(q, limit - q); - return true; - } -} - -const char* GetLengthPrefixedSlice(const char* p, const char* limit, - Slice* result) { - uint32_t len; - p = GetVarint32Ptr(p, limit, &len); - if (p == nullptr) return nullptr; - if (p + len > limit) return nullptr; - *result = Slice(p, len); - return p + len; -} - -bool GetLengthPrefixedSlice(Slice* input, Slice* result) { - uint32_t len; - if (GetVarint32(input, &len) && - input->size() >= len) { - *result = Slice(input->data(), 
len); - input->remove_prefix(len); - return true; - } else { - return false; - } -} - -Slice GetLengthPrefixedSlice(const char* data) { - uint32_t len; - const char* p = data; - p = GetVarint32Ptr(p, p + 5, &len); // +5: we assume "p" is not corrupted - return Slice(p, len); -} - -Slice GetSliceUntil(Slice* slice, char delimiter) { - uint32_t len; - for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) { - // nothing - } - - Slice ret(slice->data(), len); - slice->remove_prefix(len + ((len < slice->size()) ? 1 : 0)); - return ret; -} - void BitStreamPutInt(char* dst, size_t dstlen, size_t offset, uint32_t bits, uint64_t value) { assert((offset + bits + 7)/8 <= dstlen); @@ -320,14 +166,4 @@ void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits, BitStreamGetInt(dst, offset, bits)); } -uint64_t BitStreamGetInt(const std::string* src, size_t offset, - uint32_t bits) { - return BitStreamGetInt(src->data(), src->size(), offset, bits); -} - -uint64_t BitStreamGetInt(const Slice* src, size_t offset, - uint32_t bits) { - return BitStreamGetInt(src->data(), src->size(), offset, bits); -} - } // namespace rocksdb diff --git a/util/coding.h b/util/coding.h index c6a6b203d..8ffba51cb 100644 --- a/util/coding.h +++ b/util/coding.h @@ -13,6 +13,7 @@ // * Strings are encoded prefixed by their length in varint format #pragma once +#include #include #include #include @@ -40,6 +41,7 @@ extern void PutLengthPrefixedSliceParts(std::string* dst, extern bool GetVarint32(Slice* input, uint32_t* value); extern bool GetVarint64(Slice* input, uint64_t* value); extern bool GetLengthPrefixedSlice(Slice* input, Slice* result); +// This function assumes data is well-formed. 
extern Slice GetLengthPrefixedSlice(const char* data); extern Slice GetSliceUntil(Slice* slice, char delimiter); @@ -138,4 +140,155 @@ extern uint64_t BitStreamGetInt(const std::string* src, size_t offset, extern uint64_t BitStreamGetInt(const Slice* src, size_t offset, uint32_t bits); +// -- Implementation of the functions declared above +inline void EncodeFixed32(char* buf, uint32_t value) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; +#endif +} + +inline void EncodeFixed64(char* buf, uint64_t value) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; + buf[4] = (value >> 32) & 0xff; + buf[5] = (value >> 40) & 0xff; + buf[6] = (value >> 48) & 0xff; + buf[7] = (value >> 56) & 0xff; +#endif +} + +inline void PutFixed32(std::string* dst, uint32_t value) { + char buf[sizeof(value)]; + EncodeFixed32(buf, value); + dst->append(buf, sizeof(buf)); +} + +inline void PutFixed64(std::string* dst, uint64_t value) { + char buf[sizeof(value)]; + EncodeFixed64(buf, value); + dst->append(buf, sizeof(buf)); +} + +inline void PutVarint32(std::string* dst, uint32_t v) { + char buf[5]; + char* ptr = EncodeVarint32(buf, v); + dst->append(buf, ptr - buf); +} + +inline char* EncodeVarint64(char* dst, uint64_t v) { + static const unsigned int B = 128; + unsigned char* ptr = reinterpret_cast(dst); + while (v >= B) { + *(ptr++) = (v & (B - 1)) | B; + v >>= 7; + } + *(ptr++) = static_cast(v); + return reinterpret_cast(ptr); +} + +inline void PutVarint64(std::string* dst, uint64_t v) { + char buf[10]; + char* ptr = EncodeVarint64(buf, v); + dst->append(buf, ptr - buf); +} + +inline void PutLengthPrefixedSlice(std::string* dst, const Slice& value) { + PutVarint32(dst, 
value.size()); + dst->append(value.data(), value.size()); +} + +inline void PutLengthPrefixedSliceParts(std::string* dst, + const SliceParts& slice_parts) { + uint32_t total_bytes = 0; + for (int i = 0; i < slice_parts.num_parts; ++i) { + total_bytes += slice_parts.parts[i].size(); + } + PutVarint32(dst, total_bytes); + for (int i = 0; i < slice_parts.num_parts; ++i) { + dst->append(slice_parts.parts[i].data(), slice_parts.parts[i].size()); + } +} + +inline int VarintLength(uint64_t v) { + int len = 1; + while (v >= 128) { + v >>= 7; + len++; + } + return len; +} + +inline bool GetVarint32(Slice* input, uint32_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint32Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + *input = Slice(q, limit - q); + return true; + } +} + +inline bool GetVarint64(Slice* input, uint64_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint64Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + *input = Slice(q, limit - q); + return true; + } +} + +inline bool GetLengthPrefixedSlice(Slice* input, Slice* result) { + uint32_t len = 0; + if (GetVarint32(input, &len) && input->size() >= len) { + *result = Slice(input->data(), len); + input->remove_prefix(len); + return true; + } else { + return false; + } +} + +inline Slice GetLengthPrefixedSlice(const char* data) { + uint32_t len = 0; + // +5: we assume "data" is not corrupted + auto p = GetVarint32Ptr(data, data + 5 /* limit */, &len); + return Slice(p, len); +} + +inline Slice GetSliceUntil(Slice* slice, char delimiter) { + uint32_t len = 0; + for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) { + // nothing + } + + Slice ret(slice->data(), len); + slice->remove_prefix(len + ((len < slice->size()) ? 
1 : 0)); + return ret; +} + +inline uint64_t BitStreamGetInt(const std::string* src, size_t offset, + uint32_t bits) { + return BitStreamGetInt(src->data(), src->size(), offset, bits); +} + +inline uint64_t BitStreamGetInt(const Slice* src, size_t offset, + uint32_t bits) { + return BitStreamGetInt(src->data(), src->size(), offset, bits); +} + } // namespace rocksdb diff --git a/util/coding_test.cc b/util/coding_test.cc index fb0613238..ed542d6bf 100644 --- a/util/coding_test.cc +++ b/util/coding_test.cc @@ -41,7 +41,7 @@ TEST(Coding, Fixed64) { const char* p = s.data(); for (int power = 0; power <= 63; power++) { uint64_t v = static_cast(1) << power; - uint64_t actual; + uint64_t actual = 0; actual = DecodeFixed64(p); ASSERT_EQ(v-1, actual); p += sizeof(uint64_t); @@ -90,7 +90,7 @@ TEST(Coding, Varint32) { const char* limit = p + s.size(); for (uint32_t i = 0; i < (32 * 32); i++) { uint32_t expected = (i / 32) << (i % 32); - uint32_t actual; + uint32_t actual = 0; const char* start = p; p = GetVarint32Ptr(p, limit, &actual); ASSERT_TRUE(p != nullptr); @@ -125,7 +125,7 @@ TEST(Coding, Varint64) { const char* limit = p + s.size(); for (unsigned int i = 0; i < values.size(); i++) { ASSERT_TRUE(p < limit); - uint64_t actual; + uint64_t actual = 0; const char* start = p; p = GetVarint64Ptr(p, limit, &actual); ASSERT_TRUE(p != nullptr); diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc new file mode 100644 index 000000000..94df660ef --- /dev/null +++ b/util/dynamic_bloom.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "dynamic_bloom.h" + +#include "rocksdb/slice.h" +#include "util/hash.h" + +namespace rocksdb { + +namespace { +static uint32_t BloomHash(const Slice& key) { + return Hash(key.data(), key.size(), 0xbc9f1d34); +} +} + +DynamicBloom::DynamicBloom(uint32_t total_bits, + uint32_t (*hash_func)(const Slice& key), + uint32_t num_probes) + : hash_func_(hash_func), + kTotalBits((total_bits + 7) / 8 * 8), + kNumProbes(num_probes) { + assert(hash_func_); + assert(kNumProbes > 0); + assert(kTotalBits > 0); + data_.reset(new unsigned char[kTotalBits / 8]()); +} + +DynamicBloom::DynamicBloom(uint32_t total_bits, + uint32_t num_probes) + : DynamicBloom(total_bits, &BloomHash, num_probes) { +} + +} // rocksdb diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h new file mode 100644 index 000000000..0851becbf --- /dev/null +++ b/util/dynamic_bloom.h @@ -0,0 +1,72 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include + +namespace rocksdb { + +class Slice; + +class DynamicBloom { + public: + // total_bits: fixed total bits for the bloom + // hash_func: customized hash function + // num_probes: number of hash probes for a single key + DynamicBloom(uint32_t total_bits, + uint32_t (*hash_func)(const Slice& key), + uint32_t num_probes = 6); + + explicit DynamicBloom(uint32_t total_bits, uint32_t num_probes = 6); + + // Assuming single threaded access to this function. + void Add(const Slice& key); + + // Assuming single threaded access to this function. 
+ void AddHash(uint32_t hash); + + // Multithreaded access to this function is OK + bool MayContain(const Slice& key); + + // Multithreaded access to this function is OK + bool MayContainHash(uint32_t hash); + + private: + uint32_t (*hash_func_)(const Slice& key); + const uint32_t kTotalBits; + const uint32_t kNumProbes; + std::unique_ptr data_; +}; + +inline void DynamicBloom::Add(const Slice& key) { AddHash(hash_func_(key)); } + +inline bool DynamicBloom::MayContain(const Slice& key) { + return (MayContainHash(hash_func_(key))); +} + +inline bool DynamicBloom::MayContainHash(uint32_t h) { + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (uint32_t i = 0; i < kNumProbes; i++) { + const uint32_t bitpos = h % kTotalBits; + if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) { + return false; + } + h += delta; + } + return true; +} + +inline void DynamicBloom::AddHash(uint32_t h) { + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (uint32_t i = 0; i < kNumProbes; i++) { + const uint32_t bitpos = h % kTotalBits; + data_[bitpos / 8] |= (1 << (bitpos % 8)); + h += delta; + } +} + +} // rocksdb diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc new file mode 100644 index 000000000..58f05ae50 --- /dev/null +++ b/util/dynamic_bloom_test.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include + +#include "dynamic_bloom.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +DEFINE_int32(bits_per_key, 10, ""); +DEFINE_int32(num_probes, 6, ""); + +namespace rocksdb { + +static Slice Key(int i, char* buffer) { + memcpy(buffer, &i, sizeof(i)); + return Slice(buffer, sizeof(i)); +} + +class DynamicBloomTest { +}; + +TEST(DynamicBloomTest, EmptyFilter) { + DynamicBloom bloom(100, 2); + ASSERT_TRUE(! bloom.MayContain("hello")); + ASSERT_TRUE(! bloom.MayContain("world")); +} + +TEST(DynamicBloomTest, Small) { + DynamicBloom bloom(100, 2); + bloom.Add("hello"); + bloom.Add("world"); + ASSERT_TRUE(bloom.MayContain("hello")); + ASSERT_TRUE(bloom.MayContain("world")); + ASSERT_TRUE(! bloom.MayContain("x")); + ASSERT_TRUE(! bloom.MayContain("foo")); +} + +static int NextLength(int length) { + if (length < 10) { + length += 1; + } else if (length < 100) { + length += 10; + } else if (length < 1000) { + length += 100; + } else { + length += 1000; + } + return length; +} + +TEST(DynamicBloomTest, VaryingLengths) { + char buffer[sizeof(int)]; + + // Count number of filters that significantly exceed the false positive rate + int mediocre_filters = 0; + int good_filters = 0; + + fprintf(stderr, "bits_per_key: %d num_probes: %d\n", + FLAGS_bits_per_key, FLAGS_num_probes); + + for (int length = 1; length <= 10000; length = NextLength(length)) { + uint32_t bloom_bits = std::max(length * FLAGS_bits_per_key, 64); + DynamicBloom bloom(bloom_bits, FLAGS_num_probes); + for (int i = 0; i < length; i++) { + bloom.Add(Key(i, buffer)); + ASSERT_TRUE(bloom.MayContain(Key(i, buffer))); + } + + // All added keys must match + for (int i = 0; i < length; i++) { + ASSERT_TRUE(bloom.MayContain(Key(i, buffer))) + << "Length " << length << "; key " << i; + } + + // Check false positive rate + + int result = 0; + for (int i = 0; i < 10000; i++) { + if (bloom.MayContain(Key(i + 1000000000, buffer))) { + result++; + } + } + double rate = result / 
10000.0; + + fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; \n", + rate*100.0, length); + + //ASSERT_LE(rate, 0.02); // Must not be over 2% + if (rate > 0.0125) + mediocre_filters++; // Allowed, but not too often + else + good_filters++; + } + + fprintf(stderr, "Filters: %d good, %d mediocre\n", + good_filters, mediocre_filters); + + ASSERT_LE(mediocre_filters, good_filters/5); +} + +// Different bits-per-byte + +} // namespace rocksdb + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + + return rocksdb::test::RunAllTests(); +} diff --git a/util/env_posix.cc b/util/env_posix.cc index 638b6c906..b53cd0103 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -306,7 +306,13 @@ class PosixMmapReadableFile: public RandomAccessFile { assert(options.use_mmap_reads); assert(options.use_os_buffer); } - virtual ~PosixMmapReadableFile() { munmap(mmapped_region_, length_); } + virtual ~PosixMmapReadableFile() { + int ret = munmap(mmapped_region_, length_); + if (ret != 0) { + fprintf(stdout, "failed to munmap %p length %zu \n", + mmapped_region_, length_); + } + } virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { diff --git a/util/hash_linklist_rep.cc b/util/hash_linklist_rep.cc new file mode 100644 index 000000000..83f0f3d5a --- /dev/null +++ b/util/hash_linklist_rep.cc @@ -0,0 +1,470 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// + +#include "util/hash_linklist_rep.h" + +#include "rocksdb/memtablerep.h" +#include "util/arena.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "port/port.h" +#include "port/atomic_pointer.h" +#include "util/murmurhash.h" +#include "db/memtable.h" +#include "db/skiplist.h" + +namespace rocksdb { +namespace { + +typedef const char* Key; + +struct Node { + explicit Node(const Key& k) : + key(k) { + } + + Key const key; + + // Accessors/mutators for links. Wrapped in methods so we can + // add the appropriate barriers as necessary. + Node* Next() { + // Use an 'acquire load' so that we observe a fully initialized + // version of the returned Node. + return reinterpret_cast(next_.Acquire_Load()); + } + void SetNext(Node* x) { + // Use a 'release store' so that anybody who reads through this + // pointer observes a fully initialized version of the inserted node. + next_.Release_Store(x); + } + + // No-barrier variants that can be safely used in a few locations. 
+ Node* NoBarrier_Next() { + return reinterpret_cast(next_.NoBarrier_Load()); + } + void NoBarrier_SetNext(Node* x) { + next_.NoBarrier_Store(x); + } + +private: + port::AtomicPointer next_; +}; + +class HashLinkListRep : public MemTableRep { + public: + HashLinkListRep(MemTableRep::KeyComparator& compare, Arena* arena, + const SliceTransform* transform, size_t bucket_size); + + virtual void Insert(const char* key) override; + + virtual bool Contains(const char* key) const override; + + virtual size_t ApproximateMemoryUsage() override; + + virtual ~HashLinkListRep(); + + virtual MemTableRep::Iterator* GetIterator() override; + + virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override; + + virtual MemTableRep::Iterator* GetPrefixIterator(const Slice& prefix) + override; + + virtual MemTableRep::Iterator* GetDynamicPrefixIterator() override; + + private: + friend class DynamicIterator; + typedef SkipList FullList; + + size_t bucket_size_; + + // Maps slices (which are transformed user keys) to buckets of keys sharing + // the same transform. + port::AtomicPointer* buckets_; + + // The user-supplied transform whose domain is the user keys. 
+ const SliceTransform* transform_; + + MemTableRep::KeyComparator& compare_; + // immutable after construction + Arena* const arena_; + + bool BucketContains(Node* head, const Slice& key) const; + + Slice GetPrefix(const Slice& internal_key) const { + return transform_->Transform(ExtractUserKey(internal_key)); + } + + size_t GetHash(const Slice& slice) const { + return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_; + } + + Node* GetBucket(size_t i) const { + return static_cast(buckets_[i].Acquire_Load()); + } + + Node* GetBucket(const Slice& slice) const { + return GetBucket(GetHash(slice)); + } + + Node* NewNode(const Key& key) { + char* mem = arena_->AllocateAligned(sizeof(Node)); + return new (mem) Node(key); + } + + bool Equal(const Slice& a, const Key& b) const { + return (compare_(b, a) == 0); + } + + + bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); } + + bool KeyIsAfterNode(const Slice& internal_key, const Node* n) const { + // nullptr n is considered infinite + return (n != nullptr) && (compare_(n->key, internal_key) < 0); + } + + bool KeyIsAfterNode(const Key& key, const Node* n) const { + // nullptr n is considered infinite + return (n != nullptr) && (compare_(n->key, key) < 0); + } + + + Node* FindGreaterOrEqualInBucket(Node* head, const Slice& key) const; + + class FullListIterator : public MemTableRep::Iterator { + public: + explicit FullListIterator(FullList* list) + : iter_(list), full_list_(list) {} + + virtual ~FullListIterator() { + } + + // Returns true iff the iterator is positioned at a valid node. + virtual bool Valid() const { + return iter_.Valid(); + } + + // Returns the key at the current position. + // REQUIRES: Valid() + virtual const char* key() const { + assert(Valid()); + return iter_.key(); + } + + // Advances to the next position. + // REQUIRES: Valid() + virtual void Next() { + assert(Valid()); + iter_.Next(); + } + + // Advances to the previous position. 
+ // REQUIRES: Valid() + virtual void Prev() { + assert(Valid()); + iter_.Prev(); + } + + // Advance to the first entry with a key >= target + virtual void Seek(const Slice& internal_key, const char* memtable_key) { + const char* encoded_key = + (memtable_key != nullptr) ? + memtable_key : EncodeKey(&tmp_, internal_key); + iter_.Seek(encoded_key); + } + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToFirst() { + iter_.SeekToFirst(); + } + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToLast() { + iter_.SeekToLast(); + } + private: + FullList::Iterator iter_; + // To destruct with the iterator. + std::unique_ptr full_list_; + std::string tmp_; // For passing to EncodeKey + }; + + class Iterator : public MemTableRep::Iterator { + public: + explicit Iterator(const HashLinkListRep* const hash_link_list_rep, + Node* head) : + hash_link_list_rep_(hash_link_list_rep), head_(head), node_(nullptr) { + } + + virtual ~Iterator() { + } + + // Returns true iff the iterator is positioned at a valid node. + virtual bool Valid() const { + return node_ != nullptr; + } + + // Returns the key at the current position. + // REQUIRES: Valid() + virtual const char* key() const { + assert(Valid()); + return node_->key; + } + + // Advances to the next position. + // REQUIRES: Valid() + virtual void Next() { + assert(Valid()); + node_ = node_->Next(); + } + + // Advances to the previous position. + // REQUIRES: Valid() + virtual void Prev() { + // Prefix iterator does not support total order. 
+ // We simply set the iterator to invalid state + Reset(nullptr); + } + + // Advance to the first entry with a key >= target + virtual void Seek(const Slice& internal_key, const char* memtable_key) { + node_ = hash_link_list_rep_->FindGreaterOrEqualInBucket(head_, + internal_key); + } + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToFirst() { + // Prefix iterator does not support total order. + // We simply set the iterator to invalid state + Reset(nullptr); + } + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToLast() { + // Prefix iterator does not support total order. + // We simply set the iterator to invalid state + Reset(nullptr); + } + + protected: + void Reset(Node* head) { + head_ = head; + node_ = nullptr; + } + private: + friend class HashLinkListRep; + const HashLinkListRep* const hash_link_list_rep_; + Node* head_; + Node* node_; + std::string tmp_; // For passing to EncodeKey + + virtual void SeekToHead() { + node_ = head_; + } + }; + + class DynamicIterator : public HashLinkListRep::Iterator { + public: + explicit DynamicIterator(HashLinkListRep& memtable_rep) + : HashLinkListRep::Iterator(&memtable_rep, nullptr), + memtable_rep_(memtable_rep) {} + + // Advance to the first entry with a key >= target + virtual void Seek(const Slice& k, const char* memtable_key) { + auto transformed = memtable_rep_.GetPrefix(k); + Reset(memtable_rep_.GetBucket(transformed)); + HashLinkListRep::Iterator::Seek(k, memtable_key); + } + + private: + // the underlying memtable + const HashLinkListRep& memtable_rep_; + }; + + class EmptyIterator : public MemTableRep::Iterator { + // This is used when there wasn't a bucket. It is cheaper than + // instantiating an empty bucket over which to iterate. 
+ public: + EmptyIterator() { } + virtual bool Valid() const { + return false; + } + virtual const char* key() const { + assert(false); + return nullptr; + } + virtual void Next() { } + virtual void Prev() { } + virtual void Seek(const Slice& user_key, const char* memtable_key) { } + virtual void SeekToFirst() { } + virtual void SeekToLast() { } + private: + }; +}; + +HashLinkListRep::HashLinkListRep(MemTableRep::KeyComparator& compare, + Arena* arena, const SliceTransform* transform, + size_t bucket_size) + : bucket_size_(bucket_size), + transform_(transform), + compare_(compare), + arena_(arena) { + char* mem = arena_->AllocateAligned( + sizeof(port::AtomicPointer) * bucket_size); + + buckets_ = new (mem) port::AtomicPointer[bucket_size]; + + for (size_t i = 0; i < bucket_size_; ++i) { + buckets_[i].NoBarrier_Store(nullptr); + } +} + +HashLinkListRep::~HashLinkListRep() { +} + +void HashLinkListRep::Insert(const char* key) { + assert(!Contains(key)); + Slice internal_key = GetLengthPrefixedSlice(key); + auto transformed = GetPrefix(internal_key); + auto& bucket = buckets_[GetHash(transformed)]; + Node* head = static_cast(bucket.Acquire_Load()); + + if (!head) { + Node* x = NewNode(key); + // NoBarrier_SetNext() suffices since we will add a barrier when + // we publish a pointer to "x" in prev[i]. + x->NoBarrier_SetNext(nullptr); + bucket.Release_Store(static_cast(x)); + return; + } + + Node* cur = head; + Node* prev = nullptr; + while (true) { + if (cur == nullptr) { + break; + } + Node* next = cur->Next(); + // Make sure the lists are sorted. + // If x points to head_ or next points nullptr, it is trivially satisfied. 
+    assert((cur == head) || (next == nullptr) ||
+           KeyIsAfterNode(next->key, cur));
+    if (KeyIsAfterNode(internal_key, cur)) {
+      // Keep searching in this list
+      prev = cur;
+      cur = next;
+    } else {
+      break;
+    }
+  }
+
+  // Our data structure does not allow duplicate insertion
+  assert(cur == nullptr || !Equal(key, cur->key));
+
+  Node* x = NewNode(key);
+
+  // NoBarrier_SetNext() suffices since we will add a barrier when
+  // we publish a pointer to "x" in prev[i].
+  x->NoBarrier_SetNext(cur);
+
+  if (prev) {
+    prev->SetNext(x);
+  } else {
+    bucket.Release_Store(static_cast<void*>(x));
+  }
+}
+
+bool HashLinkListRep::Contains(const char* key) const {
+  Slice internal_key = GetLengthPrefixedSlice(key);
+
+  auto transformed = GetPrefix(internal_key);
+  auto bucket = GetBucket(transformed);
+  if (bucket == nullptr) {
+    return false;
+  }
+  return BucketContains(bucket, internal_key);
+}
+
+size_t HashLinkListRep::ApproximateMemoryUsage() {
+  // Memory is always allocated from the arena.
+  return 0;
+}
+
+MemTableRep::Iterator* HashLinkListRep::GetIterator() {
+  auto list = new FullList(compare_, arena_);
+  for (size_t i = 0; i < bucket_size_; ++i) {
+    auto bucket = GetBucket(i);
+    if (bucket != nullptr) {
+      Iterator itr(this, bucket);
+      for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
+        list->Insert(itr.key());
+      }
+    }
+  }
+  return new FullListIterator(list);
+}
+
+MemTableRep::Iterator* HashLinkListRep::GetPrefixIterator(
+  const Slice& prefix) {
+  auto bucket = GetBucket(prefix);
+  if (bucket == nullptr) {
+    return new EmptyIterator();
+  }
+  return new Iterator(this, bucket);
+}
+
+MemTableRep::Iterator* HashLinkListRep::GetIterator(const Slice& slice) {
+  return GetPrefixIterator(transform_->Transform(slice));
+}
+
+MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator() {
+  return new DynamicIterator(*this);
+}
+
+bool HashLinkListRep::BucketContains(Node* head, const Slice& user_key) const {
+  Node* x = FindGreaterOrEqualInBucket(head, user_key);
+  return (x !=
nullptr && Equal(user_key, x->key)); +} + +Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head, + const Slice& key) const { + Node* x = head; + while (true) { + if (x == nullptr) { + return x; + } + Node* next = x->Next(); + // Make sure the lists are sorted. + // If x points to head_ or next points nullptr, it is trivially satisfied. + assert((x == head) || (next == nullptr) || KeyIsAfterNode(next->key, x)); + if (KeyIsAfterNode(key, x)) { + // Keep searching in this list + x = next; + } else { + break; + } + } + return x; +} + +} // anon namespace + +MemTableRep* HashLinkListRepFactory::CreateMemTableRep( + MemTableRep::KeyComparator& compare, Arena* arena) { + return new HashLinkListRep(compare, arena, transform_, bucket_count_); +} + +MemTableRepFactory* NewHashLinkListRepFactory( + const SliceTransform* transform, size_t bucket_count) { + return new HashLinkListRepFactory(transform, bucket_count); +} + +} // namespace rocksdb diff --git a/util/hash_linklist_rep.h b/util/hash_linklist_rep.h new file mode 100644 index 000000000..efa9d8f2e --- /dev/null +++ b/util/hash_linklist_rep.h @@ -0,0 +1,39 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include "rocksdb/slice_transform.h" +#include "rocksdb/memtablerep.h" + +namespace rocksdb { + +class HashLinkListRepFactory : public MemTableRepFactory { + public: + explicit HashLinkListRepFactory( + const SliceTransform* transform, + size_t bucket_count) + : transform_(transform), + bucket_count_(bucket_count) { } + + virtual ~HashLinkListRepFactory() { delete transform_; } + + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator& compare, + Arena* arena) override; + + virtual const char* Name() const override { + return "HashLinkListRepFactory"; + } + + const SliceTransform* GetTransform() { return transform_; } + + private: + const SliceTransform* transform_; + const size_t bucket_count_; +}; + +} diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc index e9fe1573a..aa070bc8b 100644 --- a/util/hash_skiplist_rep.cc +++ b/util/hash_skiplist_rep.cc @@ -7,12 +7,13 @@ #include "util/hash_skiplist_rep.h" #include "rocksdb/memtablerep.h" -#include "rocksdb/arena.h" +#include "util/arena.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "port/port.h" #include "port/atomic_pointer.h" #include "util/murmurhash.h" +#include "db/memtable.h" #include "db/skiplist.h" namespace rocksdb { @@ -21,7 +22,8 @@ namespace { class HashSkipListRep : public MemTableRep { public: HashSkipListRep(MemTableRep::KeyComparator& compare, Arena* arena, - const SliceTransform* transform, size_t bucket_size); + const SliceTransform* transform, size_t bucket_size, + int32_t skiplist_height, int32_t skiplist_branching_factor); virtual void Insert(const char* key) override; @@ -46,6 +48,9 @@ class HashSkipListRep : public MemTableRep { size_t bucket_size_; + const int32_t skiplist_height_; + const int32_t skiplist_branching_factor_; + // Maps slices (which are transformed user keys) to buckets of keys sharing // the same transform. 
port::AtomicPointer* buckets_; @@ -112,9 +117,12 @@ class HashSkipListRep : public MemTableRep { } // Advance to the first entry with a key >= target - virtual void Seek(const char* target) { + virtual void Seek(const Slice& internal_key, const char* memtable_key) { if (list_ != nullptr) { - iter_.Seek(target); + const char* encoded_key = + (memtable_key != nullptr) ? + memtable_key : EncodeKey(&tmp_, internal_key); + iter_.Seek(encoded_key); } } @@ -151,6 +159,7 @@ class HashSkipListRep : public MemTableRep { // here we track if we own list_. If we own it, we are also // responsible for it's cleaning. This is a poor man's shared_ptr bool own_list_; + std::string tmp_; // For passing to EncodeKey }; class DynamicIterator : public HashSkipListRep::Iterator { @@ -160,11 +169,10 @@ class HashSkipListRep : public MemTableRep { memtable_rep_(memtable_rep) {} // Advance to the first entry with a key >= target - virtual void Seek(const char* target) { - auto transformed = memtable_rep_.transform_->Transform( - memtable_rep_.UserKey(target)); + virtual void Seek(const Slice& k, const char* memtable_key) { + auto transformed = memtable_rep_.transform_->Transform(ExtractUserKey(k)); Reset(memtable_rep_.GetBucket(transformed)); - HashSkipListRep::Iterator::Seek(target); + HashSkipListRep::Iterator::Seek(k, memtable_key); } // Position at the first entry in collection. 
@@ -201,7 +209,8 @@ class HashSkipListRep : public MemTableRep {
     }
     virtual void Next() { }
     virtual void Prev() { }
-    virtual void Seek(const char* target) { }
+    virtual void Seek(const Slice& internal_key,
+                      const char* memtable_key) { }
     virtual void SeekToFirst() { }
     virtual void SeekToLast() { }
   private:
@@ -210,8 +219,11 @@
 
 HashSkipListRep::HashSkipListRep(MemTableRep::KeyComparator& compare,
                                  Arena* arena, const SliceTransform* transform,
-                                 size_t bucket_size)
+                                 size_t bucket_size, int32_t skiplist_height,
+                                 int32_t skiplist_branching_factor)
     : bucket_size_(bucket_size),
+      skiplist_height_(skiplist_height),
+      skiplist_branching_factor_(skiplist_branching_factor),
       transform_(transform),
       compare_(compare),
       arena_(arena) {
@@ -232,7 +244,8 @@ HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket(
   auto bucket = GetBucket(hash);
   if (bucket == nullptr) {
     auto addr = arena_->AllocateAligned(sizeof(Bucket));
-    bucket = new (addr) Bucket(compare_, arena_);
+    bucket = new (addr) Bucket(compare_, arena_, skiplist_height_,
+                               skiplist_branching_factor_);
     buckets_[hash].Release_Store(static_cast<void*>(bucket));
   }
   return bucket;
@@ -292,12 +305,15 @@ MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator() {
 
 MemTableRep* HashSkipListRepFactory::CreateMemTableRep(
     MemTableRep::KeyComparator& compare, Arena* arena) {
-  return new HashSkipListRep(compare, arena, transform_, bucket_count_);
+  return new HashSkipListRep(compare, arena, transform_, bucket_count_,
+                             skiplist_height_, skiplist_branching_factor_);
 }
 
 MemTableRepFactory* NewHashSkipListRepFactory(
-    const SliceTransform* transform, size_t bucket_count) {
-  return new HashSkipListRepFactory(transform, bucket_count);
+    const SliceTransform* transform, size_t bucket_count,
+    int32_t skiplist_height, int32_t skiplist_branching_factor) {
+  return new HashSkipListRepFactory(transform, bucket_count,
+                                    skiplist_height, skiplist_branching_factor);
 }
 
 } // namespace rocksdb
diff --git a/util/hash_skiplist_rep.h b/util/hash_skiplist_rep.h
index 7b8414c88..1ea844eda 100644
--- a/util/hash_skiplist_rep.h
+++ b/util/hash_skiplist_rep.h
@@ -14,10 +14,15 @@ namespace rocksdb {
 class HashSkipListRepFactory : public MemTableRepFactory {
  public:
-  explicit HashSkipListRepFactory(const SliceTransform* transform,
-                                  size_t bucket_count = 1000000)
-    : transform_(transform),
-      bucket_count_(bucket_count) { }
+  explicit HashSkipListRepFactory(
+    const SliceTransform* transform,
+    size_t bucket_count,
+    int32_t skiplist_height,
+    int32_t skiplist_branching_factor)
+      : transform_(transform),
+        bucket_count_(bucket_count),
+        skiplist_height_(skiplist_height),
+        skiplist_branching_factor_(skiplist_branching_factor) { }
 
   virtual ~HashSkipListRepFactory() { delete transform_; }
 
@@ -33,6 +38,8 @@ class HashSkipListRepFactory : public MemTableRepFactory {
  private:
   const SliceTransform* transform_;
   const size_t bucket_count_;
+  const int32_t skiplist_height_;
+  const int32_t skiplist_branching_factor_;
 };
 
 }
diff --git a/util/options.cc b/util/options.cc
index 212dc4653..2a2807155 100644
--- a/util/options.cc
+++ b/util/options.cc
@@ -16,10 +16,11 @@
 #include "rocksdb/comparator.h"
 #include "rocksdb/env.h"
 #include "rocksdb/filter_policy.h"
-#include "rocksdb/merge_operator.h"
 #include "rocksdb/memtablerep.h"
+#include "rocksdb/merge_operator.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
 #include "rocksdb/table_properties.h"
 #include "table/block_based_table_factory.h"
@@ -73,6 +74,9 @@
           std::shared_ptr<TableFactory>(new BlockBasedTableFactory())),
       inplace_update_support(false),
       inplace_update_num_locks(10000),
+      inplace_callback(nullptr),
+      memtable_prefix_bloom_bits(0),
+      memtable_prefix_bloom_probes(6),
       max_successive_merges(0) {
   assert(memtable_factory.get() != nullptr);
 }
@@ -131,6 +135,9 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
table_properties_collectors(options.table_properties_collectors), inplace_update_support(options.inplace_update_support), inplace_update_num_locks(options.inplace_update_num_locks), + inplace_callback(options.inplace_callback), + memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits), + memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes), max_successive_merges(options.max_successive_merges) { assert(memtable_factory.get() != nullptr); } @@ -396,6 +403,11 @@ Options::Dump(Logger* log) const inplace_update_support); Log(log, " Options.inplace_update_num_locks: %zd", inplace_update_num_locks); + // TODO: easier config for bloom (maybe based on avg key/value size) + Log(log, " Options.memtable_prefix_bloom_bits: %d", + memtable_prefix_bloom_bits); + Log(log, " Options.memtable_prefix_bloom_probes: %d", + memtable_prefix_bloom_probes); Log(log, " Options.max_successive_merges: %zd", max_successive_merges); } // Options::Dump diff --git a/util/perf_context.cc b/util/perf_context.cc index 1e8ddfb5e..6833f6836 100644 --- a/util/perf_context.cc +++ b/util/perf_context.cc @@ -22,7 +22,20 @@ void PerfContext::Reset() { block_decompress_time = 0; internal_key_skipped_count = 0; internal_delete_skipped_count = 0; - wal_write_time = 0; + write_wal_time = 0; + + get_snapshot_time = 0; + get_from_memtable_time = 0; + get_from_memtable_count = 0; + get_post_process_time = 0; + get_from_output_files_time = 0; + seek_child_seek_time = 0; + seek_child_seek_count = 0; + seek_min_heap_time = 0; + seek_internal_seek_time = 0; + find_next_user_entry_time = 0; + write_pre_and_post_process_time = 0; + write_memtable_time = 0; } __thread PerfContext perf_context; diff --git a/util/skiplistrep.cc b/util/skiplistrep.cc index a5b072ad1..6f1fb1a15 100644 --- a/util/skiplistrep.cc +++ b/util/skiplistrep.cc @@ -70,8 +70,13 @@ public: } // Advance to the first entry with a key >= target - virtual void Seek(const char* target) override { - iter_.Seek(target); + virtual void 
Seek(const Slice& user_key, const char* memtable_key)
+      override {
+    if (memtable_key != nullptr) {
+      iter_.Seek(memtable_key);
+    } else {
+      iter_.Seek(EncodeKey(&tmp_, user_key));
+    }
   }
 
   // Position at the first entry in list.
@@ -85,6 +90,8 @@ public:
   virtual void SeekToLast() override {
     iter_.SeekToLast();
   }
+ protected:
+  std::string tmp_;       // For passing to EncodeKey
 };
 
 // Unhide default implementations of GetIterator
diff --git a/util/testutil.h b/util/testutil.h
index c73210fec..4fc8c0f5b 100644
--- a/util/testutil.h
+++ b/util/testutil.h
@@ -8,6 +8,8 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #pragma once
+#include <string>
+#include "db/dbformat.h"
 #include "rocksdb/env.h"
 #include "rocksdb/slice.h"
 #include "util/random.h"
@@ -51,5 +53,28 @@ class ErrorEnv : public EnvWrapper {
   }
 };
 
+// An internal comparator that just forward comparing results from the
+// user comparator in it. Can be used to test entities that have no dependency
+// on internal key structure but consumes InternalKeyComparator, like
+// BlockBasedTable.
+class PlainInternalKeyComparator : public InternalKeyComparator { + public: + explicit PlainInternalKeyComparator(const Comparator* c) + : InternalKeyComparator(c) {} + + virtual ~PlainInternalKeyComparator() {} + + virtual int Compare(const Slice& a, const Slice& b) const override { + return user_comparator()->Compare(a, b); + } + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const override { + user_comparator()->FindShortestSeparator(start, limit); + } + virtual void FindShortSuccessor(std::string* key) const override { + user_comparator()->FindShortSuccessor(key); + } +}; + } // namespace test } // namespace rocksdb diff --git a/util/vectorrep.cc b/util/vectorrep.cc index 87fae4bc7..4b8b3d552 100644 --- a/util/vectorrep.cc +++ b/util/vectorrep.cc @@ -11,7 +11,8 @@ #include #include -#include "rocksdb/arena.h" +#include "util/arena.h" +#include "db/memtable.h" #include "port/port.h" #include "util/mutexlock.h" #include "util/stl_wrappers.h" @@ -45,6 +46,7 @@ class VectorRep : public MemTableRep { std::shared_ptr> bucket_; typename std::vector::const_iterator mutable cit_; const KeyComparator& compare_; + std::string tmp_; // For passing to EncodeKey bool mutable sorted_; void DoSort() const; public: @@ -73,7 +75,7 @@ class VectorRep : public MemTableRep { virtual void Prev() override; // Advance to the first entry with a key >= target - virtual void Seek(const char* target) override; + virtual void Seek(const Slice& user_key, const char* memtable_key) override; // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. 
@@ -200,12 +202,15 @@ void VectorRep::Iterator::Prev() { } // Advance to the first entry with a key >= target -void VectorRep::Iterator::Seek(const char* target) { +void VectorRep::Iterator::Seek(const Slice& user_key, + const char* memtable_key) { DoSort(); // Do binary search to find first value not less than the target + const char* encoded_key = + (memtable_key != nullptr) ? memtable_key : EncodeKey(&tmp_, user_key); cit_ = std::equal_range(bucket_->begin(), bucket_->end(), - target, + encoded_key, [this] (const char* a, const char* b) { return compare_(a, b) < 0; }).first;