diff --git a/.clang-format b/.clang-format index a1e9a48e4..7c279811a 100644 --- a/.clang-format +++ b/.clang-format @@ -2,46 +2,4 @@ # http://clang.llvm.org/docs/ClangFormatStyleOptions.html --- BasedOnStyle: Google -AccessModifierOffset: -1 -ConstructorInitializerIndentWidth: 4 -AlignEscapedNewlinesLeft: true -AlignTrailingComments: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortIfStatementsOnASingleLine: false -AllowShortLoopsOnASingleLine: false -AlwaysBreakTemplateDeclarations: true -AlwaysBreakBeforeMultilineStrings: true -BreakBeforeBinaryOperators: false -BreakConstructorInitializersBeforeComma: false -BinPackParameters: false -ColumnLimit: 80 -ConstructorInitializerAllOnOneLineOrOnePerLine: true -DerivePointerBinding: true -ExperimentalAutoDetectBinPacking: true -IndentCaseLabels: false -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCSpaceBeforeProtocolList: false -PenaltyBreakBeforeFirstCallParameter: 10 -PenaltyBreakComment: 60 -PenaltyBreakString: 1000 -PenaltyBreakFirstLessLess: 20 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 200 -PointerBindsToType: true -SpacesBeforeTrailingComments: 2 -Cpp11BracedListStyle: true -Standard: Cpp11 -IndentWidth: 2 -TabWidth: 8 -UseTab: Never -BreakBeforeBraces: Attach -IndentFunctionDeclarationAfterType: false -SpacesInParentheses: false -SpacesInAngles: false -SpaceInEmptyParentheses: false -SpacesInCStyleCastParentheses: false -SpaceAfterControlStatementKeyword: true -SpaceBeforeAssignmentOperators: true -ContinuationIndentWidth: 4 ... 
diff --git a/Makefile b/Makefile index 6a5995b28..ab13ac0d5 100644 --- a/Makefile +++ b/Makefile @@ -128,19 +128,21 @@ $(SHARED2): $(SHARED3) ln -fs $(SHARED3) $(SHARED2) endif -$(SHARED3): - $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(COVERAGEFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) -o $@ $(LDFLAGS) +$(SHARED3): $(LIBOBJECTS) + $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(LDFLAGS) $(SOURCES)-o $@ endif # PLATFORM_SHARED_EXT all: $(LIBRARY) $(PROGRAMS) .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ - release tags valgrind_check whitebox_crash_test + release tags valgrind_check whitebox_crash_test format +# Will also generate shared libraries. release: $(MAKE) clean - OPT=-DNDEBUG $(MAKE) -j32 + OPT=-DNDEBUG $(MAKE) all -j32 + OPT=-DNDEBUG $(MAKE) $(SHARED) -j32 coverage: $(MAKE) clean @@ -197,6 +199,9 @@ tags: ctags * -R cscope -b `find . -name '*.cc'` `find . -name '*.h'` +format: + build_tools/format-diff.sh + # --------------------------------------------------------------------------- # Unit tests and tools # --------------------------------------------------------------------------- @@ -415,6 +420,12 @@ DEPFILES = $(filter-out util/build_version.d,$(SOURCES:.cc=.d)) depend: $(DEPFILES) +# if the make goal is either "clean" or "format", we shouldn't +# try to import the *.d files. +# TODO(kailiu) The unfamiliarity of Make's conditions leads to the ugly +# working solution. 
ifneq ($(MAKECMDGOALS),clean) +ifneq ($(MAKECMDGOALS),format) -include $(DEPFILES) endif +endif diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 87c4c871d..8e83ae497 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -81,9 +81,9 @@ PLATFORM_CCFLAGS= PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}" PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS" PLATFORM_SHARED_EXT="so" -PLATFORM_SHARED_LDFLAGS="${EXEC_LDFLAGS_SHARED} -shared -Wl,-soname -Wl," +PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl," PLATFORM_SHARED_CFLAGS="-fPIC" -PLATFORM_SHARED_VERSIONED=true +PLATFORM_SHARED_VERSIONED=false # generic port files (working on all platform by #ifdef) go directly in /port GENERIC_PORT_FILES=`find $ROCKSDB_ROOT/port -name '*.cc' | tr "\n" " "` diff --git a/build_tools/fbcode.gcc481.sh b/build_tools/fbcode.gcc481.sh index ae2bb57da..e8c9f090b 100644 --- a/build_tools/fbcode.gcc481.sh +++ b/build_tools/fbcode.gcc481.sh @@ -60,7 +60,7 @@ AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic" -CFLAGS+=" -nostdlib $LIBGCC_INCLUDE $GLIBC_INCLUDE" +CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE" CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT" CFLAGS+=" -DSNAPPY -DGFLAGS -DZLIB -DBZIP2" diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh new file mode 100755 index 000000000..ceae38192 --- /dev/null +++ b/build_tools/format-diff.sh @@ -0,0 +1,109 @@ +#!/bin/bash +# If clang_format_diff.py command is not specfied, we assume we are able to +# access directly without any path. +if [ -z $CLANG_FORMAT_DIFF ] +then +CLANG_FORMAT_DIFF="clang-format-diff.py" +fi + +# Check clang-format-diff.py +if ! 
which $CLANG_FORMAT_DIFF &> /dev/null +then + echo "You didn't have clang-format-diff.py available in your computer!" + echo "You can download it by running: " + echo " curl http://goo.gl/iUW1u2" + exit 128 +fi + +# Check argparse, a library that clang-format-diff.py requires. +python 2>/dev/null << EOF +import argparse +EOF + +if [ "$?" != 0 ] +then + echo "To run clang-format-diff.py, we'll need the library "argparse" to be" + echo "installed. You can try either of the follow ways to install it:" + echo " 1. Manually download argparse: https://pypi.python.org/pypi/argparse" + echo " 2. easy_install argparse (if you have easy_install)" + echo " 3. pip install argparse (if you have pip)" + exit 129 +fi + +# TODO(kailiu) following work is not complete since we still need to figure +# out how to add the modified files done pre-commit hook to git's commit index. +# +# Check if this script has already been added to pre-commit hook. +# Will suggest user to add this script to pre-commit hook if their pre-commit +# is empty. +# PRE_COMMIT_SCRIPT_PATH="`git rev-parse --show-toplevel`/.git/hooks/pre-commit" +# if ! ls $PRE_COMMIT_SCRIPT_PATH &> /dev/null +# then +# echo "Would you like to add this script to pre-commit hook, which will do " +# echo -n "the format check for all the affected lines before you check in (y/n):" +# read add_to_hook +# if [ "$add_to_hook" == "y" ] +# then +# ln -s `git rev-parse --show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH +# fi +# fi + +set -e + +uncommitted_code=`git diff HEAD` + +# If there's no uncommitted changes, we assume user are doing post-commit +# format check, in which case we'll check the modified lines from latest commit. +# Otherwise, we'll check format of the uncommitted code only. 
+format_last_commit=0 +if [ -z "$uncommitted_code" ] +then + # Check the format of last commit + diffs=$(git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -p 1) +else + # Check the format of uncommitted lines. + diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1) +fi + +if [ -z "$diffs" ] +then + echo "Nothing needs to be reformatted!" + exit 0 +fi + +# Highlight the insertion/deletion from the clang-format-diff.py's output +COLOR_END="\033[0m" +COLOR_RED="\033[0;31m" +COLOR_GREEN="\033[0;32m" + +echo -e "Detect lines that doesn't follow the format rules:\r" +# Add the color to the diff. lines added will be green; lines removed will be red. +echo "$diffs" | + sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" | + sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/" +echo -e "Would you like to fix the format automatically (y/n): \c" + +# Make sure under any mode, we can read user input. +exec < /dev/tty +read to_fix + +if [ "$to_fix" != "y" ] +then + exit 1 +fi + +# Do in-place format adjustment. +git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -i -p 1 +echo "Files reformatted!" + +# Amend to last commit if the user does the post-commit format check +if [ -z "$uncommitted_code" ]; then + echo -e "Would you like to amend the changes to last commit (`git log HEAD --oneline | head -1`)? (y/n): \c" + read to_amend + + if [ "$to_amend" == "y" ] + then + git commit -a --amend --reuse-message HEAD + echo "Amended to last commit" + fi +fi diff --git a/db/compaction.cc b/db/compaction.cc new file mode 100644 index 000000000..703e7aeae --- /dev/null +++ b/db/compaction.cc @@ -0,0 +1,214 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/compaction.h" + +namespace rocksdb { + +static uint64_t TotalFileSize(const std::vector& files) { + uint64_t sum = 0; + for (size_t i = 0; i < files.size() && files[i]; i++) { + sum += files[i]->file_size; + } + return sum; +} + +Compaction::Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, + uint64_t max_grandparent_overlap_bytes, + bool seek_compaction, bool enable_compression) + : level_(level), + out_level_(out_level), + max_output_file_size_(target_file_size), + maxGrandParentOverlapBytes_(max_grandparent_overlap_bytes), + input_version_(input_version), + number_levels_(input_version_->NumberLevels()), + seek_compaction_(seek_compaction), + enable_compression_(enable_compression), + grandparent_index_(0), + seen_key_(false), + overlapped_bytes_(0), + base_index_(-1), + parent_index_(-1), + score_(0), + bottommost_level_(false), + is_full_compaction_(false), + level_ptrs_(std::vector(number_levels_)) { + + input_version_->Ref(); + edit_ = new VersionEdit(); + for (int i = 0; i < number_levels_; i++) { + level_ptrs_[i] = 0; + } +} + +Compaction::~Compaction() { + delete edit_; + if (input_version_ != nullptr) { + input_version_->Unref(); + } +} + +bool Compaction::IsTrivialMove() const { + // Avoid a move if there is lots of overlapping grandparent data. + // Otherwise, the move could create a parent file that will require + // a very expensive merge later on. + // If level_== out_level_, the purpose is to force compaction filter to be + // applied to that level, and thus cannot be a trivia move. 
+ return (level_ != out_level_ && + num_input_files(0) == 1 && + num_input_files(1) == 0 && + TotalFileSize(grandparents_) <= maxGrandParentOverlapBytes_); +} + +void Compaction::AddInputDeletions(VersionEdit* edit) { + for (int which = 0; which < 2; which++) { + for (size_t i = 0; i < inputs_[which].size(); i++) { + edit->DeleteFile(level_ + which, inputs_[which][i]->number); + } + } +} + +bool Compaction::IsBaseLevelForKey(const Slice& user_key) { + if (input_version_->vset_->options_->compaction_style == + kCompactionStyleUniversal) { + return bottommost_level_; + } + // Maybe use binary search to find right entry instead of linear search? + const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); + for (int lvl = level_ + 2; lvl < number_levels_; lvl++) { + const std::vector& files = input_version_->files_[lvl]; + for (; level_ptrs_[lvl] < files.size(); ) { + FileMetaData* f = files[level_ptrs_[lvl]]; + if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { + // We've advanced far enough + if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { + // Key falls in this file's range, so definitely not base level + return false; + } + break; + } + level_ptrs_[lvl]++; + } + } + return true; +} + +bool Compaction::ShouldStopBefore(const Slice& internal_key) { + // Scan to find earliest grandparent file that contains key. 
+ const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; + while (grandparent_index_ < grandparents_.size() && + icmp->Compare(internal_key, + grandparents_[grandparent_index_]->largest.Encode()) > 0) { + if (seen_key_) { + overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; + } + assert(grandparent_index_ + 1 >= grandparents_.size() || + icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(), + grandparents_[grandparent_index_+1]->smallest.Encode()) + < 0); + grandparent_index_++; + } + seen_key_ = true; + + if (overlapped_bytes_ > maxGrandParentOverlapBytes_) { + // Too much overlap for current output; start new output + overlapped_bytes_ = 0; + return true; + } else { + return false; + } +} + +// Mark (or clear) each file that is being compacted +void Compaction::MarkFilesBeingCompacted(bool value) { + for (int i = 0; i < 2; i++) { + std::vector v = inputs_[i]; + for (unsigned int j = 0; j < inputs_[i].size(); j++) { + assert(value ? !inputs_[i][j]->being_compacted : + inputs_[i][j]->being_compacted); + inputs_[i][j]->being_compacted = value; + } + } +} + +// Is this compaction producing files at the bottommost level? +void Compaction::SetupBottomMostLevel(bool isManual) { + if (input_version_->vset_->options_->compaction_style == + kCompactionStyleUniversal) { + // If universal compaction style is used and manual + // compaction is occuring, then we are guaranteed that + // all files will be picked in a single compaction + // run. We can safely set bottommost_level_ = true. + // If it is not manual compaction, then bottommost_level_ + // is already set when the Compaction was created. 
+ if (isManual) { + bottommost_level_ = true; + } + return; + } + bottommost_level_ = true; + int num_levels = input_version_->vset_->NumberLevels(); + for (int i = output_level() + 1; i < num_levels; i++) { + if (input_version_->NumLevelFiles(i) > 0) { + bottommost_level_ = false; + break; + } + } +} + +void Compaction::ReleaseInputs() { + if (input_version_ != nullptr) { + input_version_->Unref(); + input_version_ = nullptr; + } +} + +void Compaction::ResetNextCompactionIndex() { + input_version_->ResetNextCompactionIndex(level_); +} + +static void InputSummary(std::vector& files, char* output, + int len) { + int write = 0; + for (unsigned int i = 0; i < files.size(); i++) { + int sz = len - write; + int ret = snprintf(output + write, sz, "%lu(%lu) ", + (unsigned long)files.at(i)->number, + (unsigned long)files.at(i)->file_size); + if (ret < 0 || ret >= sz) + break; + write += ret; + } +} + +void Compaction::Summary(char* output, int len) { + int write = snprintf(output, len, + "Base version %lu Base level %d, seek compaction:%d, inputs:", + (unsigned long)input_version_->GetVersionNumber(), + level_, + seek_compaction_); + if (write < 0 || write > len) { + return; + } + + char level_low_summary[100]; + InputSummary(inputs_[0], level_low_summary, sizeof(level_low_summary)); + char level_up_summary[100]; + if (inputs_[1].size()) { + InputSummary(inputs_[1], level_up_summary, sizeof(level_up_summary)); + } else { + level_up_summary[0] = '\0'; + } + + snprintf(output + write, len - write, "[%s],[%s]", + level_low_summary, level_up_summary); +} + +} // namespace rocksdb diff --git a/db/compaction.h b/db/compaction.h new file mode 100644 index 000000000..5e696a053 --- /dev/null +++ b/db/compaction.h @@ -0,0 +1,134 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "db/version_set.h" + +namespace rocksdb { + +class Version; + +// A Compaction encapsulates information about a compaction. +class Compaction { + public: + ~Compaction(); + + // Return the level that is being compacted. Inputs from "level" + // will be merged. + int level() const { return level_; } + + // Outputs will go to this level + int output_level() const { return out_level_; } + + // Return the object that holds the edits to the descriptor done + // by this compaction. + VersionEdit* edit() { return edit_; } + + // "which" must be either 0 or 1 + int num_input_files(int which) const { return inputs_[which].size(); } + + // Return the ith input file at "level()+which" ("which" must be 0 or 1). + FileMetaData* input(int which, int i) const { return inputs_[which][i]; } + + // Maximum size of files to build during this compaction. + uint64_t MaxOutputFileSize() const { return max_output_file_size_; } + + // Whether compression will be enabled for compaction outputs + bool enable_compression() const { return enable_compression_; } + + // Is this a trivial compaction that can be implemented by just + // moving a single input file to the next level (no merging or splitting) + bool IsTrivialMove() const; + + // Add all inputs to this compaction as delete operations to *edit. + void AddInputDeletions(VersionEdit* edit); + + // Returns true if the information we have available guarantees that + // the compaction is producing data in "level+1" for which no data exists + // in levels greater than "level+1". 
+ bool IsBaseLevelForKey(const Slice& user_key); + + // Returns true iff we should stop building the current output + // before processing "internal_key". + bool ShouldStopBefore(const Slice& internal_key); + + // Release the input version for the compaction, once the compaction + // is successful. + void ReleaseInputs(); + + void Summary(char* output, int len); + + // Return the score that was used to pick this compaction run. + double score() const { return score_; } + + // Is this compaction creating a file in the bottom most level? + bool BottomMostLevel() { return bottommost_level_; } + + // Does this compaction include all sst files? + bool IsFullCompaction() { return is_full_compaction_; } + + private: + friend class Version; + friend class VersionSet; + friend class CompactionPicker; + friend class UniversalCompactionPicker; + friend class LevelCompactionPicker; + + Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes, + bool seek_compaction = false, bool enable_compression = true); + + int level_; + int out_level_; // levels to which output files are stored + uint64_t max_output_file_size_; + uint64_t maxGrandParentOverlapBytes_; + Version* input_version_; + VersionEdit* edit_; + int number_levels_; + + bool seek_compaction_; + bool enable_compression_; + + // Each compaction reads inputs from "level_" and "level_+1" + std::vector inputs_[2]; // The two sets of inputs + + // State used to check for number of of overlapping grandparent files + // (parent == level_ + 1, grandparent == level_ + 2) + std::vector grandparents_; + size_t grandparent_index_; // Index in grandparent_starts_ + bool seen_key_; // Some output key has been seen + uint64_t overlapped_bytes_; // Bytes of overlap between current output + // and grandparent files + int base_index_; // index of the file in files_[level_] + int parent_index_; // index of some file with same range in files_[level_+1] + double score_; 
// score that was used to pick this compaction. + + // Is this compaction creating a file in the bottom most level? + bool bottommost_level_; + // Does this compaction include all sst files? + bool is_full_compaction_; + + // level_ptrs_ holds indices into input_version_->levels_: our state + // is that we are positioned at one of the file ranges for each + // higher level than the ones involved in this compaction (i.e. for + // all L >= level_ + 2). + std::vector level_ptrs_; + + // mark (or clear) all files that are being compacted + void MarkFilesBeingCompacted(bool); + + // Initialize whether compaction producing files at the bottommost level + void SetupBottomMostLevel(bool isManual); + + // In case of compaction error, reset the nextIndex that is used + // to pick up the next file to be compacted from files_by_size_ + void ResetNextCompactionIndex(); +}; + +} // namespace rocksdb diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc new file mode 100644 index 000000000..cfa3770d7 --- /dev/null +++ b/db/compaction_picker.cc @@ -0,0 +1,847 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/compaction_picker.h" +#include "util/statistics.h" + +namespace rocksdb { + +namespace { + +uint64_t TotalFileSize(const std::vector& files) { + uint64_t sum = 0; + for (size_t i = 0; i < files.size() && files[i]; i++) { + sum += files[i]->file_size; + } + return sum; +} + +} // anonymous namespace + +CompactionPicker::CompactionPicker(const Options* options, + const InternalKeyComparator* icmp) + : compactions_in_progress_(options->num_levels), + options_(options), + num_levels_(options->num_levels), + icmp_(icmp) { + Init(); +} + +void CompactionPicker::ReduceNumberOfLevels(int new_levels) { + num_levels_ = new_levels; + Init(); +} + +void CompactionPicker::Init() { + max_file_size_.reset(new uint64_t[NumberLevels()]); + level_max_bytes_.reset(new uint64_t[NumberLevels()]); + int target_file_size_multiplier = options_->target_file_size_multiplier; + int max_bytes_multiplier = options_->max_bytes_for_level_multiplier; + for (int i = 0; i < NumberLevels(); i++) { + if (i == 0 && options_->compaction_style == kCompactionStyleUniversal) { + max_file_size_[i] = ULLONG_MAX; + level_max_bytes_[i] = options_->max_bytes_for_level_base; + } else if (i > 1) { + max_file_size_[i] = max_file_size_[i - 1] * target_file_size_multiplier; + level_max_bytes_[i] = + level_max_bytes_[i - 1] * max_bytes_multiplier * + options_->max_bytes_for_level_multiplier_additional[i - 1]; + } else { + max_file_size_[i] = options_->target_file_size_base; + level_max_bytes_[i] = options_->max_bytes_for_level_base; + } + } +} + +CompactionPicker::~CompactionPicker() {} + +void CompactionPicker::SizeBeingCompacted(std::vector& sizes) { + for (int level = 0; level < NumberLevels() - 1; level++) { + uint64_t total = 0; + for (auto c : compactions_in_progress_[level]) { + assert(c->level() == level); + for (int i = 0; i < c->num_input_files(0); i++) { + total += c->input(0,i)->file_size; + } + } + sizes[level] = total; + } +} + +// Clear all files to indicate that they are not being 
compacted +// Delete this compaction from the list of running compactions. +void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) { + c->MarkFilesBeingCompacted(false); + compactions_in_progress_[c->level()].erase(c); + if (!status.ok()) { + c->ResetNextCompactionIndex(); + } +} + +uint64_t CompactionPicker::MaxFileSizeForLevel(int level) const { + assert(level >= 0); + assert(level < NumberLevels()); + return max_file_size_[level]; +} + +uint64_t CompactionPicker::MaxGrandParentOverlapBytes(int level) { + uint64_t result = MaxFileSizeForLevel(level); + result *= options_->max_grandparent_overlap_factor; + return result; +} + +double CompactionPicker::MaxBytesForLevel(int level) { + // Note: the result for level zero is not really used since we set + // the level-0 compaction threshold based on number of files. + assert(level >= 0); + assert(level < NumberLevels()); + return level_max_bytes_[level]; +} + +void CompactionPicker::GetRange(const std::vector& inputs, + InternalKey* smallest, InternalKey* largest) { + assert(!inputs.empty()); + smallest->Clear(); + largest->Clear(); + for (size_t i = 0; i < inputs.size(); i++) { + FileMetaData* f = inputs[i]; + if (i == 0) { + *smallest = f->smallest; + *largest = f->largest; + } else { + if (icmp_->Compare(f->smallest, *smallest) < 0) { + *smallest = f->smallest; + } + if (icmp_->Compare(f->largest, *largest) > 0) { + *largest = f->largest; + } + } + } +} + +void CompactionPicker::GetRange(const std::vector& inputs1, + const std::vector& inputs2, + InternalKey* smallest, InternalKey* largest) { + std::vector all = inputs1; + all.insert(all.end(), inputs2.begin(), inputs2.end()); + GetRange(all, smallest, largest); +} + +bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) { + // If inputs are empty then there is nothing to expand. + if (!c || c->inputs_[0].empty()) { + return true; + } + + // GetOverlappingInputs will always do the right thing for level-0. 
+ // So we don't need to do any expansion if level == 0. + if (c->level() == 0) { + return true; + } + + const int level = c->level(); + InternalKey smallest, largest; + + // Keep expanding c->inputs_[0] until we are sure that there is a + // "clean cut" boundary between the files in input and the surrounding files. + // This will ensure that no parts of a key are lost during compaction. + int hint_index = -1; + size_t old_size; + do { + old_size = c->inputs_[0].size(); + GetRange(c->inputs_[0], &smallest, &largest); + c->inputs_[0].clear(); + c->input_version_->GetOverlappingInputs( + level, &smallest, &largest, &c->inputs_[0], hint_index, &hint_index); + } while(c->inputs_[0].size() > old_size); + + // Get the new range + GetRange(c->inputs_[0], &smallest, &largest); + + // If, after the expansion, there are files that are already under + // compaction, then we must drop/cancel this compaction. + int parent_index = -1; + if (FilesInCompaction(c->inputs_[0]) || + (c->level() != c->output_level() && + ParentRangeInCompaction(c->input_version_, &smallest, &largest, level, + &parent_index))) { + c->inputs_[0].clear(); + c->inputs_[1].clear(); + return false; + } + return true; +} + +uint64_t CompactionPicker::ExpandedCompactionByteSizeLimit(int level) { + uint64_t result = MaxFileSizeForLevel(level); + result *= options_->expanded_compaction_factor; + return result; +} + +// Returns true if any one of specified files are being compacted +bool CompactionPicker::FilesInCompaction(std::vector& files) { + for (unsigned int i = 0; i < files.size(); i++) { + if (files[i]->being_compacted) { + return true; + } + } + return false; +} + +// Returns true if any one of the parent files are being compacted +bool CompactionPicker::ParentRangeInCompaction(Version* version, + const InternalKey* smallest, + const InternalKey* largest, + int level, int* parent_index) { + std::vector inputs; + assert(level + 1 < NumberLevels()); + + version->GetOverlappingInputs(level + 1, smallest, 
largest, &inputs, + *parent_index, parent_index); + return FilesInCompaction(inputs); +} + +// Populates the set of inputs from "level+1" that overlap with "level". +// Will also attempt to expand "level" if that doesn't expand "level+1" +// or cause "level" to include a file for compaction that has an overlapping +// user-key with another file. +void CompactionPicker::SetupOtherInputs(Compaction* c) { + // If inputs are empty, then there is nothing to expand. + // If both input and output levels are the same, no need to consider + // files at level "level+1" + if (c->inputs_[0].empty() || c->level() == c->output_level()) { + return; + } + + const int level = c->level(); + InternalKey smallest, largest; + + // Get the range one last time. + GetRange(c->inputs_[0], &smallest, &largest); + + // Populate the set of next-level files (inputs_[1]) to include in compaction + c->input_version_->GetOverlappingInputs(level + 1, &smallest, &largest, + &c->inputs_[1], c->parent_index_, + &c->parent_index_); + + // Get entire range covered by compaction + InternalKey all_start, all_limit; + GetRange(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); + + // See if we can further grow the number of inputs in "level" without + // changing the number of "level+1" files we pick up. We also choose NOT + // to expand if this would cause "level" to include some entries for some + // user key, while excluding other entries for the same user key. This + // can happen when one user key spans multiple files. 
+ if (!c->inputs_[1].empty()) { + std::vector expanded0; + c->input_version_->GetOverlappingInputs( + level, &all_start, &all_limit, &expanded0, c->base_index_, nullptr); + const uint64_t inputs0_size = TotalFileSize(c->inputs_[0]); + const uint64_t inputs1_size = TotalFileSize(c->inputs_[1]); + const uint64_t expanded0_size = TotalFileSize(expanded0); + uint64_t limit = ExpandedCompactionByteSizeLimit(level); + if (expanded0.size() > c->inputs_[0].size() && + inputs1_size + expanded0_size < limit && + !FilesInCompaction(expanded0) && + !c->input_version_->HasOverlappingUserKey(&expanded0, level)) { + InternalKey new_start, new_limit; + GetRange(expanded0, &new_start, &new_limit); + std::vector expanded1; + c->input_version_->GetOverlappingInputs(level + 1, &new_start, &new_limit, + &expanded1, c->parent_index_, + &c->parent_index_); + if (expanded1.size() == c->inputs_[1].size() && + !FilesInCompaction(expanded1)) { + Log(options_->info_log, + "Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu bytes)" + "\n", + (unsigned long)level, + (unsigned long)(c->inputs_[0].size()), + (unsigned long)(c->inputs_[1].size()), + (unsigned long)inputs0_size, + (unsigned long)inputs1_size, + (unsigned long)(expanded0.size()), + (unsigned long)(expanded1.size()), + (unsigned long)expanded0_size, + (unsigned long)inputs1_size); + smallest = new_start; + largest = new_limit; + c->inputs_[0] = expanded0; + c->inputs_[1] = expanded1; + GetRange(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); + } + } + } + + // Compute the set of grandparent files that overlap this compaction + // (parent == level+1; grandparent == level+2) + if (level + 2 < NumberLevels()) { + c->input_version_->GetOverlappingInputs(level + 2, &all_start, &all_limit, + &c->grandparents_); + } +} + + +Compaction* CompactionPicker::CompactRange(Version* version, int input_level, + int output_level, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end) { + std::vector 
inputs; + bool covering_the_whole_range = true; + + // All files are 'overlapping' in universal style compaction. + // We have to compact the entire range in one shot. + if (options_->compaction_style == kCompactionStyleUniversal) { + begin = nullptr; + end = nullptr; + } + version->GetOverlappingInputs(input_level, begin, end, &inputs); + if (inputs.empty()) { + return nullptr; + } + + // Avoid compacting too much in one shot in case the range is large. + // But we cannot do this for level-0 since level-0 files can overlap + // and we must not pick one file and drop another older file if the + // two files overlap. + if (input_level > 0) { + const uint64_t limit = + MaxFileSizeForLevel(input_level) * options_->source_compaction_factor; + uint64_t total = 0; + for (size_t i = 0; i + 1 < inputs.size(); ++i) { + uint64_t s = inputs[i]->file_size; + total += s; + if (total >= limit) { + **compaction_end = inputs[i + 1]->smallest; + covering_the_whole_range = false; + inputs.resize(i + 1); + break; + } + } + } + Compaction* c = new Compaction(version, input_level, output_level, + MaxFileSizeForLevel(output_level), + MaxGrandParentOverlapBytes(input_level)); + + c->inputs_[0] = inputs; + if (ExpandWhileOverlapping(c) == false) { + delete c; + Log(options_->info_log, "Could not compact due to expansion failure.\n"); + return nullptr; + } + + SetupOtherInputs(c); + + if (covering_the_whole_range) { + *compaction_end = nullptr; + } + + // These files that are to be manaully compacted do not trample + // upon other files because manual compactions are processed when + // the system has a max of 1 background compaction thread. + c->MarkFilesBeingCompacted(true); + + // Is this compaction creating a file at the bottommost level + c->SetupBottomMostLevel(true); + return c; +} + +Compaction* LevelCompactionPicker::PickCompaction(Version* version) { + Compaction* c = nullptr; + int level = -1; + + // Compute the compactions needed. 
It is better to do it here + // and also in LogAndApply(), otherwise the values could be stale. + std::vector size_being_compacted(NumberLevels() - 1); + SizeBeingCompacted(size_being_compacted); + version->Finalize(size_being_compacted); + + // We prefer compactions triggered by too much data in a level over + // the compactions triggered by seeks. + // + // Find the compactions by size on all levels. + for (int i = 0; i < NumberLevels() - 1; i++) { + assert(i == 0 || + version->compaction_score_[i] <= version->compaction_score_[i - 1]); + level = version->compaction_level_[i]; + if ((version->compaction_score_[i] >= 1)) { + c = PickCompactionBySize(version, level, version->compaction_score_[i]); + if (ExpandWhileOverlapping(c) == false) { + delete c; + c = nullptr; + } else { + break; + } + } + } + + // Find compactions needed by seeks + FileMetaData* f = version->file_to_compact_; + if (c == nullptr && f != nullptr && !f->being_compacted) { + + level = version->file_to_compact_level_; + int parent_index = -1; + + // Only allow one level 0 compaction at a time. + // Do not pick this file if its parents at level+1 are being compacted. + if (level != 0 || compactions_in_progress_[0].empty()) { + if (!ParentRangeInCompaction(version, &f->smallest, &f->largest, level, + &parent_index)) { + c = new Compaction(version, level, level + 1, + MaxFileSizeForLevel(level + 1), + MaxGrandParentOverlapBytes(level), true); + c->inputs_[0].push_back(f); + c->parent_index_ = parent_index; + c->input_version_->file_to_compact_ = nullptr; + if (ExpandWhileOverlapping(c) == false) { + return nullptr; + } + } + } + } + + if (c == nullptr) { + return nullptr; + } + + // Two level 0 compaction won't run at the same time, so don't need to worry + // about files on level 0 being compacted. 
+ if (level == 0) { + assert(compactions_in_progress_[0].empty()); + InternalKey smallest, largest; + GetRange(c->inputs_[0], &smallest, &largest); + // Note that the next call will discard the file we placed in + // c->inputs_[0] earlier and replace it with an overlapping set + // which will include the picked file. + c->inputs_[0].clear(); + c->input_version_->GetOverlappingInputs(0, &smallest, &largest, + &c->inputs_[0]); + + // If we include more L0 files in the same compaction run it can + // cause the 'smallest' and 'largest' key to get extended to a + // larger range. So, re-invoke GetRange to get the new key range + GetRange(c->inputs_[0], &smallest, &largest); + if (ParentRangeInCompaction(c->input_version_, &smallest, &largest, level, + &c->parent_index_)) { + delete c; + return nullptr; + } + assert(!c->inputs_[0].empty()); + } + + // Setup "level+1" files (inputs_[1]) + SetupOtherInputs(c); + + // mark all the files that are being compacted + c->MarkFilesBeingCompacted(true); + + // Is this compaction creating a file at the bottommost level + c->SetupBottomMostLevel(false); + + // remember this currently undergoing compaction + compactions_in_progress_[level].insert(c); + + return c; +} + +Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version, + int level, + double score) { + Compaction* c = nullptr; + + // level 0 files are overlapping. So we cannot pick more + // than one concurrent compactions at this level. This + // could be made better by looking at key-ranges that are + // being compacted at level 0. 
+ if (level == 0 && compactions_in_progress_[level].size() == 1) { + return nullptr; + } + + assert(level >= 0); + assert(level + 1 < NumberLevels()); + c = new Compaction(version, level, level + 1, MaxFileSizeForLevel(level + 1), + MaxGrandParentOverlapBytes(level)); + c->score_ = score; + + // Pick the largest file in this level that is not already + // being compacted + std::vector& file_size = c->input_version_->files_by_size_[level]; + + // record the first file that is not yet compacted + int nextIndex = -1; + + for (unsigned int i = c->input_version_->next_file_to_compact_by_size_[level]; + i < file_size.size(); i++) { + int index = file_size[i]; + FileMetaData* f = c->input_version_->files_[level][index]; + + // check to verify files are arranged in descending size + assert((i == file_size.size() - 1) || + (i >= Version::number_of_files_to_sort_ - 1) || + (f->file_size >= + c->input_version_->files_[level][file_size[i + 1]]->file_size)); + + // do not pick a file to compact if it is being compacted + // from n-1 level. + if (f->being_compacted) { + continue; + } + + // remember the startIndex for the next call to PickCompaction + if (nextIndex == -1) { + nextIndex = i; + } + + //if (i > Version::number_of_files_to_sort_) { + // Log(options_->info_log, "XXX Looking at index %d", i); + //} + + // Do not pick this file if its parents at level+1 are being compacted. + // Maybe we can avoid redoing this work in SetupOtherInputs + int parent_index = -1; + if (ParentRangeInCompaction(c->input_version_, &f->smallest, &f->largest, + level, &parent_index)) { + continue; + } + c->inputs_[0].push_back(f); + c->base_index_ = index; + c->parent_index_ = parent_index; + break; + } + + if (c->inputs_[0].empty()) { + delete c; + c = nullptr; + } + + // store where to start the iteration in the next call to PickCompaction + version->next_file_to_compact_by_size_[level] = nextIndex; + + return c; +} + +// Universal style of compaction. 
Pick files that are contiguous in +// time-range to compact. +// +Compaction* UniversalCompactionPicker::PickCompaction(Version* version) { + int level = 0; + double score = version->compaction_score_[0]; + + if ((version->files_[level].size() < + (unsigned int)options_->level0_file_num_compaction_trigger)) { + Log(options_->info_log, "Universal: nothing to do\n"); + return nullptr; + } + Version::FileSummaryStorage tmp; + Log(options_->info_log, "Universal: candidate files(%lu): %s\n", + version->files_[level].size(), + version->LevelFileSummary(&tmp, 0)); + + // Check for size amplification first. + Compaction* c = PickCompactionUniversalSizeAmp(version, score); + if (c == nullptr) { + + // Size amplification is within limits. Try reducing read + // amplification while maintaining file size ratios. + unsigned int ratio = options_->compaction_options_universal.size_ratio; + c = PickCompactionUniversalReadAmp(version, score, ratio, UINT_MAX); + + // Size amplification and file size ratios are within configured limits. + // If max read amplification is exceeding configured limits, then force + // compaction without looking at filesize ratios and try to reduce + // the number of files to fewer than level0_file_num_compaction_trigger. + if (c == nullptr) { + unsigned int num_files = version->files_[level].size() - + options_->level0_file_num_compaction_trigger; + c = PickCompactionUniversalReadAmp(version, score, UINT_MAX, num_files); + } + } + if (c == nullptr) { + return nullptr; + } + assert(c->inputs_[0].size() > 1); + + // validate that all the chosen files are non overlapping in time + FileMetaData* newerfile __attribute__((unused)) = nullptr; + for (unsigned int i = 0; i < c->inputs_[0].size(); i++) { + FileMetaData* f = c->inputs_[0][i]; + assert (f->smallest_seqno <= f->largest_seqno); + assert(newerfile == nullptr || + newerfile->smallest_seqno > f->largest_seqno); + newerfile = f; + } + + // The files are sorted from newest first to oldest last. 
+ std::vector& file_by_time = c->input_version_->files_by_size_[level]; + + // Is the earliest file part of this compaction? + int last_index = file_by_time[file_by_time.size()-1]; + FileMetaData* last_file = c->input_version_->files_[level][last_index]; + if (c->inputs_[0][c->inputs_[0].size()-1] == last_file) { + c->bottommost_level_ = true; + } + + // update statistics + MeasureTime(options_->statistics.get(), NUM_FILES_IN_SINGLE_COMPACTION, + c->inputs_[0].size()); + + // mark all the files that are being compacted + c->MarkFilesBeingCompacted(true); + + // remember this currently undergoing compaction + compactions_in_progress_[level].insert(c); + + // Record whether this compaction includes all sst files. + // For now, it is only relevant in universal compaction mode. + c->is_full_compaction_ = + (c->inputs_[0].size() == c->input_version_->files_[0].size()); + + return c; +} + +// +// Consider compaction files based on their size differences with +// the next file in time order. +// +Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( + Version* version, double score, unsigned int ratio, + unsigned int max_number_of_files_to_compact) { + int level = 0; + + unsigned int min_merge_width = + options_->compaction_options_universal.min_merge_width; + unsigned int max_merge_width = + options_->compaction_options_universal.max_merge_width; + + // The files are sorted from newest first to oldest last. + std::vector& file_by_time = version->files_by_size_[level]; + FileMetaData* f = nullptr; + bool done = false; + int start_index = 0; + unsigned int candidate_count; + assert(file_by_time.size() == version->files_[level].size()); + + unsigned int max_files_to_compact = std::min(max_merge_width, + max_number_of_files_to_compact); + min_merge_width = std::max(min_merge_width, 2U); + + // Considers a candidate file only if it is smaller than the + // total size accumulated so far. 
+ for (unsigned int loop = 0; loop < file_by_time.size(); loop++) { + + candidate_count = 0; + + // Skip files that are already being compacted + for (f = nullptr; loop < file_by_time.size(); loop++) { + int index = file_by_time[loop]; + f = version->files_[level][index]; + + if (!f->being_compacted) { + candidate_count = 1; + break; + } + Log(options_->info_log, + "Universal: file %lu[%d] being compacted, skipping", + (unsigned long)f->number, loop); + f = nullptr; + } + + // This file is not being compacted. Consider it as the + // first candidate to be compacted. + uint64_t candidate_size = f != nullptr? f->file_size : 0; + if (f != nullptr) { + Log(options_->info_log, "Universal: Possible candidate file %lu[%d].", + (unsigned long)f->number, loop); + } + + // Check if the suceeding files need compaction. + for (unsigned int i = loop+1; + candidate_count < max_files_to_compact && i < file_by_time.size(); + i++) { + int index = file_by_time[i]; + FileMetaData* f = version->files_[level][index]; + if (f->being_compacted) { + break; + } + // pick files if the total candidate file size (increased by the + // specified ratio) is still larger than the next candidate file. + uint64_t sz = (candidate_size * (100L + ratio)) /100; + if (sz < f->file_size) { + break; + } + candidate_count++; + candidate_size += f->file_size; + } + + // Found a series of consecutive files that need compaction. 
+ if (candidate_count >= (unsigned int)min_merge_width) { + start_index = loop; + done = true; + break; + } else { + for (unsigned int i = loop; + i < loop + candidate_count && i < file_by_time.size(); i++) { + int index = file_by_time[i]; + FileMetaData* f = version->files_[level][index]; + Log(options_->info_log, + "Universal: Skipping file %lu[%d] with size %lu %d\n", + (unsigned long)f->number, + i, + (unsigned long)f->file_size, + f->being_compacted); + } + } + } + if (!done || candidate_count <= 1) { + return nullptr; + } + unsigned int first_index_after = start_index + candidate_count; + // Compression is enabled if files compacted earlier already reached + // size ratio of compression. + bool enable_compression = true; + int ratio_to_compress = + options_->compaction_options_universal.compression_size_percent; + if (ratio_to_compress >= 0) { + uint64_t total_size = version->NumLevelBytes(level); + uint64_t older_file_size = 0; + for (unsigned int i = file_by_time.size() - 1; i >= first_index_after; + i--) { + older_file_size += version->files_[level][file_by_time[i]]->file_size; + if (older_file_size * 100L >= total_size * (long) ratio_to_compress) { + enable_compression = false; + break; + } + } + } + Compaction* c = + new Compaction(version, level, level, MaxFileSizeForLevel(level), + LLONG_MAX, false, enable_compression); + c->score_ = score; + + for (unsigned int i = start_index; i < first_index_after; i++) { + int index = file_by_time[i]; + FileMetaData* f = c->input_version_->files_[level][index]; + c->inputs_[0].push_back(f); + Log(options_->info_log, "Universal: Picking file %lu[%d] with size %lu\n", + (unsigned long)f->number, + i, + (unsigned long)f->file_size); + } + return c; +} + +// Look at overall size amplification. 
If size amplification +// exceeeds the configured value, then do a compaction +// of the candidate files all the way upto the earliest +// base file (overrides configured values of file-size ratios, +// min_merge_width and max_merge_width). +// +Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( + Version* version, double score) { + int level = 0; + + // percentage flexibilty while reducing size amplification + uint64_t ratio = options_->compaction_options_universal. + max_size_amplification_percent; + + // The files are sorted from newest first to oldest last. + std::vector& file_by_time = version->files_by_size_[level]; + assert(file_by_time.size() == version->files_[level].size()); + + unsigned int candidate_count = 0; + uint64_t candidate_size = 0; + unsigned int start_index = 0; + FileMetaData* f = nullptr; + + // Skip files that are already being compacted + for (unsigned int loop = 0; loop < file_by_time.size() - 1; loop++) { + int index = file_by_time[loop]; + f = version->files_[level][index]; + if (!f->being_compacted) { + start_index = loop; // Consider this as the first candidate. + break; + } + Log(options_->info_log, "Universal: skipping file %lu[%d] compacted %s", + (unsigned long)f->number, + loop, + " cannot be a candidate to reduce size amp.\n"); + f = nullptr; + } + if (f == nullptr) { + return nullptr; // no candidate files + } + + Log(options_->info_log, "Universal: First candidate file %lu[%d] %s", + (unsigned long)f->number, + start_index, + " to reduce size amp.\n"); + + // keep adding up all the remaining files + for (unsigned int loop = start_index; loop < file_by_time.size() - 1; + loop++) { + int index = file_by_time[loop]; + f = version->files_[level][index]; + if (f->being_compacted) { + Log(options_->info_log, + "Universal: Possible candidate file %lu[%d] %s.", + (unsigned long)f->number, + loop, + " is already being compacted. 
No size amp reduction possible.\n"); + return nullptr; + } + candidate_size += f->file_size; + candidate_count++; + } + if (candidate_count == 0) { + return nullptr; + } + + // size of earliest file + int index = file_by_time[file_by_time.size() - 1]; + uint64_t earliest_file_size = version->files_[level][index]->file_size; + + // size amplification = percentage of additional size + if (candidate_size * 100 < ratio * earliest_file_size) { + Log(options_->info_log, + "Universal: size amp not needed. newer-files-total-size %lu " + "earliest-file-size %lu", + (unsigned long)candidate_size, + (unsigned long)earliest_file_size); + return nullptr; + } else { + Log(options_->info_log, + "Universal: size amp needed. newer-files-total-size %lu " + "earliest-file-size %lu", + (unsigned long)candidate_size, + (unsigned long)earliest_file_size); + } + assert(start_index >= 0 && start_index < file_by_time.size() - 1); + + // create a compaction request + // We always compact all the files, so always compress. + Compaction* c = + new Compaction(version, level, level, MaxFileSizeForLevel(level), + LLONG_MAX, false, true); + c->score_ = score; + for (unsigned int loop = start_index; loop < file_by_time.size(); loop++) { + int index = file_by_time[loop]; + f = c->input_version_->files_[level][index]; + c->inputs_[0].push_back(f); + Log(options_->info_log, + "Universal: size amp picking file %lu[%d] with size %lu", + (unsigned long)f->number, + index, + (unsigned long)f->file_size); + } + return c; +} + +} // namespace rocksdb diff --git a/db/compaction_picker.h b/db/compaction_picker.h new file mode 100644 index 000000000..0fe086a18 --- /dev/null +++ b/db/compaction_picker.h @@ -0,0 +1,162 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "db/version_set.h" +#include "db/compaction.h" +#include "rocksdb/status.h" +#include "rocksdb/options.h" + +#include +#include +#include + +namespace rocksdb { + +class Compaction; +class Version; + +class CompactionPicker { + public: + CompactionPicker(const Options* options, const InternalKeyComparator* icmp); + virtual ~CompactionPicker(); + + // See VersionSet::ReduceNumberOfLevels() + void ReduceNumberOfLevels(int new_levels); + + // Pick level and inputs for a new compaction. + // Returns nullptr if there is no compaction to be done. + // Otherwise returns a pointer to a heap-allocated object that + // describes the compaction. Caller should delete the result. + virtual Compaction* PickCompaction(Version* version) = 0; + + // Return a compaction object for compacting the range [begin,end] in + // the specified level. Returns nullptr if there is nothing in that + // level that overlaps the specified range. Caller should delete + // the result. + // + // The returned Compaction might not include the whole requested range. + // In that case, compaction_end will be set to the next key that needs + // compacting. In case the compaction will compact the whole range, + // compaction_end will be set to nullptr. + // Client is responsible for compaction_end storage -- when called, + // *compaction_end should point to valid InternalKey! 
+ Compaction* CompactRange(Version* version, int input_level, int output_level, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end); + + // Free up the files that participated in a compaction + void ReleaseCompactionFiles(Compaction* c, Status status); + + // Return the total amount of data that is undergoing + // compactions per level + void SizeBeingCompacted(std::vector& sizes); + + // Returns maximum total overlap bytes with grandparent + // level (i.e., level+2) before we stop building a single + // file in level->level+1 compaction. + uint64_t MaxGrandParentOverlapBytes(int level); + + // Returns maximum total bytes of data on a given level. + double MaxBytesForLevel(int level); + + // Get the max file size in a given level. + uint64_t MaxFileSizeForLevel(int level) const; + + protected: + int NumberLevels() const { return num_levels_; } + + // Stores the minimal range that covers all entries in inputs in + // *smallest, *largest. + // REQUIRES: inputs is not empty + void GetRange(const std::vector& inputs, InternalKey* smallest, + InternalKey* largest); + + // Stores the minimal range that covers all entries in inputs1 and inputs2 + // in *smallest, *largest. + // REQUIRES: inputs is not empty + void GetRange(const std::vector& inputs1, + const std::vector& inputs2, + InternalKey* smallest, InternalKey* largest); + + // Add more files to the inputs on "level" to make sure that + // no newer version of a key is compacted to "level+1" while leaving an older + // version in a "level". Otherwise, any Get() will search "level" first, + // and will likely return an old/stale value for the key, since it always + // searches in increasing order of level to find the value. This could + // also scramble the order of merge operands. This function should be + // called any time a new Compaction is created, and its inputs_[0] are + // populated. + // + // Will return false if it is impossible to apply this compaction. 
+ bool ExpandWhileOverlapping(Compaction* c); + + uint64_t ExpandedCompactionByteSizeLimit(int level); + + // Returns true if any one of the specified files are being compacted + bool FilesInCompaction(std::vector& files); + + // Returns true if any one of the parent files are being compacted + bool ParentRangeInCompaction(Version* version, const InternalKey* smallest, + const InternalKey* largest, int level, + int* index); + + void SetupOtherInputs(Compaction* c); + + // record all the ongoing compactions for all levels + std::vector> compactions_in_progress_; + + // Per-level target file size. + std::unique_ptr max_file_size_; + + // Per-level max bytes + std::unique_ptr level_max_bytes_; + + const Options* const options_; + private: + void Init(); + + int num_levels_; + + const InternalKeyComparator* const icmp_; +}; + +class UniversalCompactionPicker : public CompactionPicker { + public: + UniversalCompactionPicker(const Options* options, + const InternalKeyComparator* icmp) + : CompactionPicker(options, icmp) {} + virtual Compaction* PickCompaction(Version* version) override; + + private: + // Pick Universal compaction to limit read amplification + Compaction* PickCompactionUniversalReadAmp(Version* version, double score, + unsigned int ratio, + unsigned int num_files); + + // Pick Universal compaction to limit space amplification. + Compaction* PickCompactionUniversalSizeAmp(Version* version, double score); +}; + +class LevelCompactionPicker : public CompactionPicker { + public: + LevelCompactionPicker(const Options* options, + const InternalKeyComparator* icmp) + : CompactionPicker(options, icmp) {} + virtual Compaction* PickCompaction(Version* version) override; + + private: + // For the specfied level, pick a compaction. + // Returns nullptr if there is no compaction to be done. + // If level is 0 and there is already a compaction on that level, this + // function will return nullptr. 
+ Compaction* PickCompactionBySize(Version* version, int level, double score); +}; + +} // namespace rocksdb diff --git a/db/db_bench.cc b/db/db_bench.cc index e0ba58281..e41a31cf3 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -14,7 +14,7 @@ #include #include "db/db_impl.h" #include "db/version_set.h" -#include "db/db_statistics.h" +#include "rocksdb/statistics.h" #include "rocksdb/options.h" #include "rocksdb/cache.h" #include "rocksdb/db.h" @@ -30,6 +30,7 @@ #include "util/random.h" #include "util/stack_trace.h" #include "util/string_util.h" +#include "util/statistics.h" #include "util/testutil.h" #include "hdfs/env_hdfs.h" #include "utilities/merge_operators.h" @@ -355,9 +356,9 @@ static bool ValidateCompressionLevel(const char* flagname, int32_t value) { return true; } -static const bool FLAGS_compression_level_dummy = - google::RegisterFlagValidator(&FLAGS_compression_level, - &ValidateCompressionLevel); +static const bool FLAGS_compression_level_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_compression_level, + &ValidateCompressionLevel); DEFINE_int32(min_level_to_compress, -1, "If non-negative, compression starts" " from this level. 
Levels with number < min_level_to_compress are" diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index a7232246a..04d6d0e17 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -74,7 +74,7 @@ Status DBImpl::GetLiveFiles(std::vector& ret, // Make a set of all of the live *.sst files std::set live; - versions_->AddLiveFilesCurrentVersion(&live); + versions_->current()->AddLiveFiles(&live); ret.clear(); ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST diff --git a/db/db_impl.cc b/db/db_impl.cc index d07868d21..cb23c979e 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -57,6 +57,7 @@ #include "util/mutexlock.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" +#include "util/autovector.h" namespace rocksdb { @@ -254,8 +255,8 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) : env_(options.env), dbname_(dbname), internal_comparator_(options.comparator), - options_(SanitizeOptions( - dbname, &internal_comparator_, &internal_filter_policy_, options)), + options_(SanitizeOptions(dbname, &internal_comparator_, + &internal_filter_policy_, options)), internal_filter_policy_(options.filter_policy), owns_info_log_(options_.info_log != options.info_log), db_lock_(nullptr), @@ -263,8 +264,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) shutting_down_(nullptr), bg_cv_(&mutex_), mem_rep_factory_(options_.memtable_factory.get()), - mem_(new MemTable(internal_comparator_, mem_rep_factory_, - NumberLevels(), options_)), + mem_(new MemTable(internal_comparator_, options_)), logfile_number_(0), super_version_(nullptr), tmp_batch_(), @@ -410,7 +410,7 @@ uint64_t DBImpl::TEST_Current_Manifest_FileNo() { } Status DBImpl::NewDB() { - VersionEdit new_db(NumberLevels()); + VersionEdit new_db; new_db.SetComparatorName(user_comparator()->Name()); new_db.SetLogNumber(0); new_db.SetNextFile(2); @@ -1048,8 +1048,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, WriteBatchInternal::SetContents(&batch, 
record); if (mem == nullptr) { - mem = new MemTable(internal_comparator_, mem_rep_factory_, - NumberLevels(), options_); + mem = new MemTable(internal_comparator_, options_); mem->Ref(); } status = WriteBatchInternal::InsertInto(&batch, mem, &options_); @@ -1300,6 +1299,7 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress, void DBImpl::CompactRange(const ColumnFamilyHandle& column_family, const Slice* begin, const Slice* end, bool reduce_level, int target_level) { + FlushMemTable(FlushOptions()); int max_level_with_files = 1; { MutexLock l(&mutex_); @@ -1310,9 +1310,15 @@ void DBImpl::CompactRange(const ColumnFamilyHandle& column_family, } } } - TEST_FlushMemTable(); // TODO(sanjay): Skip if memtable does not overlap - for (int level = 0; level < max_level_with_files; level++) { - TEST_CompactRange(level, begin, end); + for (int level = 0; level <= max_level_with_files; level++) { + // in case the compaction is unversal or if we're compacting the + // bottom-most level, the output level will be the same as input one + if (options_.compaction_style == kCompactionStyleUniversal || + level == max_level_with_files) { + RunManualCompaction(level, level, begin, end); + } else { + RunManualCompaction(level, level + 1, begin, end); + } } if (reduce_level) { @@ -1324,13 +1330,13 @@ void DBImpl::CompactRange(const ColumnFamilyHandle& column_family, // return the same level if it cannot be moved int DBImpl::FindMinimumEmptyLevelFitting(int level) { mutex_.AssertHeld(); + Version* current = versions_->current(); int minimum_level = level; for (int i = level - 1; i > 0; --i) { // stop if level i is not empty - if (versions_->NumLevelFiles(i) > 0) break; - + if (current->NumLevelFiles(i) > 0) break; // stop if level i is too small (cannot fit the level files) - if (versions_->MaxBytesForLevel(i) < versions_->NumLevelBytes(level)) break; + if (versions_->MaxBytesForLevel(i) < current->NumLevelBytes(level)) break; minimum_level = i; } @@ -1376,7 +1382,7 @@ void 
DBImpl::ReFitLevel(int level, int target_level) { Log(options_.info_log, "Before refitting:\n%s", versions_->current()->DebugString().data()); - VersionEdit edit(NumberLevels()); + VersionEdit edit; for (const auto& f : versions_->current()->files_[level]) { edit.DeleteFile(level, f->number); edit.AddFile(to_level, f->number, f->file_size, f->smallest, f->largest, @@ -1612,13 +1618,17 @@ Status DBImpl::AppendSortedWalsOfType(const std::string& path, return status; } -void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { - assert(level >= 0); +void DBImpl::RunManualCompaction(int input_level, + int output_level, + const Slice* begin, + const Slice* end) { + assert(input_level >= 0); InternalKey begin_storage, end_storage; ManualCompaction manual; - manual.level = level; + manual.input_level = input_level; + manual.output_level = output_level; manual.done = false; manual.in_progress = false; // For universal compaction, we enforce every manual compaction to compact @@ -1646,11 +1656,11 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { // can compact any range of keys/files. // // bg_manual_only_ is non-zero when at least one thread is inside - // TEST_CompactRange(), i.e. during that time no other compaction will + // RunManualCompaction(), i.e. during that time no other compaction will // get scheduled (see MaybeScheduleFlushOrCompaction). // // Note that the following loop doesn't stop more that one thread calling - // TEST_CompactRange() from getting to the second while loop below. + // RunManualCompaction() from getting to the second while loop below. // However, only one of them will actually schedule compaction, while // others will wait on a condition variable until it completes. 
@@ -1680,6 +1690,15 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { --bg_manual_only_; } +void DBImpl::TEST_CompactRange(int level, + const Slice* begin, + const Slice* end) { + int output_level = (options_.compaction_style == kCompactionStyleUniversal) + ? level + : level + 1; + RunManualCompaction(level, output_level, begin, end); +} + Status DBImpl::FlushMemTable(const FlushOptions& options) { // nullptr batch means just wait for earlier writes to be done Status s = Write(WriteOptions(), nullptr); @@ -1825,6 +1844,11 @@ void DBImpl::TEST_PurgeObsoleteteWAL() { PurgeObsoleteWALFiles(); } +uint64_t DBImpl::TEST_GetLevel0TotalSize() { + MutexLock l(&mutex_); + return versions_->current()->NumLevelBytes(0); +} + void DBImpl::BackgroundCallCompaction() { bool madeProgress = false; DeletionState deletion_state(options_.max_write_buffer_number, true); @@ -1899,23 +1923,27 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, unique_ptr c; bool is_manual = (manual_compaction_ != nullptr) && (manual_compaction_->in_progress == false); - InternalKey manual_end; + InternalKey manual_end_storage; + InternalKey* manual_end = &manual_end_storage; if (is_manual) { ManualCompaction* m = manual_compaction_; assert(!m->in_progress); m->in_progress = true; // another thread cannot pick up the same work - c.reset(versions_->CompactRange(m->level, m->begin, m->end)); - if (c) { - manual_end = c->input(0, c->num_input_files(0) - 1)->largest; - } else { + c.reset(versions_->CompactRange( + m->input_level, m->output_level, m->begin, m->end, &manual_end)); + if (!c) { m->done = true; } Log(options_.info_log, - "Manual compaction at level-%d from %s .. %s; will stop at %s\n", - m->level, + "Manual compaction from level-%d to level-%d from %s .. %s; will stop " + "at %s\n", + m->input_level, + m->output_level, (m->begin ? m->begin->DebugString().c_str() : "(begin)"), (m->end ? m->end->DebugString().c_str() : "(end)"), - (m->done ? 
"(end)" : manual_end.DebugString().c_str())); + ((m->done || manual_end == nullptr) + ? "(end)" + : manual_end->DebugString().c_str())); } else if (!options_.disable_auto_compactions) { c.reset(versions_->PickCompaction()); } @@ -1934,13 +1962,11 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, f->smallest_seqno, f->largest_seqno); status = versions_->LogAndApply(c->edit(), &mutex_); InstallSuperVersion(deletion_state); - VersionSet::LevelSummaryStorage tmp; + Version::LevelSummaryStorage tmp; Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n", - static_cast(f->number), - c->level() + 1, + static_cast(f->number), c->level() + 1, static_cast(f->file_size), - status.ToString().c_str(), - versions_->LevelSummary(&tmp)); + status.ToString().c_str(), versions_->current()->LevelSummary(&tmp)); versions_->ReleaseCompactionFiles(c.get(), status); *madeProgress = true; } else { @@ -1980,13 +2006,19 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, // Also note that, if we don't stop here, then the current compaction // writes a new file back to level 0, which will be used in successive // compaction. Hence the manual compaction will never finish. - if (options_.compaction_style == kCompactionStyleUniversal) { + // + // Stop the compaction if manual_end points to nullptr -- this means + // that we compacted the whole range. manual_end should always point + // to nullptr in case of universal compaction + if (manual_end == nullptr) { m->done = true; } if (!m->done) { // We only compacted part of the requested range. Update *m // to the range that is left to be compacted. 
- m->tmp_storage = manual_end; + // Universal compaction should always compact the whole range + assert(options_.compaction_style != kCompactionStyleUniversal); + m->tmp_storage = *manual_end; m->begin = &m->tmp_storage; } m->in_progress = false; // not being processed anymore @@ -2018,14 +2050,14 @@ void DBImpl::CleanupCompaction(CompactionState* compact, Status status) { } // Allocate the file numbers for the output file. We allocate as -// many output file numbers as there are files in level+1. +// many output file numbers as there are files in level+1 (at least one) // Insert them into pending_outputs so that they do not get deleted. void DBImpl::AllocateCompactionOutputFileNumbers(CompactionState* compact) { mutex_.AssertHeld(); assert(compact != nullptr); assert(compact->builder == nullptr); int filesNeeded = compact->compaction->num_input_files(1); - for (int i = 0; i < filesNeeded; i++) { + for (int i = 0; i < std::max(filesNeeded, 1); i++) { uint64_t file_number = versions_->NewFileNumber(); pending_outputs_.insert(file_number); compact->allocated_file_numbers.push_back(file_number); @@ -2169,14 +2201,11 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) { // Add compaction outputs compact->compaction->AddInputDeletions(compact->compaction->edit()); - const int level = compact->compaction->level(); for (size_t i = 0; i < compact->outputs.size(); i++) { const CompactionState::Output& out = compact->outputs[i]; compact->compaction->edit()->AddFile( - (options_.compaction_style == kCompactionStyleUniversal) ? 
- level : level + 1, - out.number, out.file_size, out.smallest, out.largest, - out.smallest_seqno, out.largest_seqno); + compact->compaction->output_level(), out.number, out.file_size, + out.smallest, out.largest, out.smallest_seqno, out.largest_seqno); } return versions_->LogAndApply(compact->compaction->edit(), &mutex_); } @@ -2218,14 +2247,14 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, compact->compaction->num_input_files(0), compact->compaction->level(), compact->compaction->num_input_files(1), - compact->compaction->level() + 1, + compact->compaction->output_level(), compact->compaction->score(), options_.max_background_compactions - bg_compaction_scheduled_); char scratch[256]; compact->compaction->Summary(scratch, sizeof(scratch)); Log(options_.info_log, "Compaction start summary: %s\n", scratch); - assert(versions_->NumLevelFiles(compact->compaction->level()) > 0); + assert(versions_->current()->NumLevelFiles(compact->compaction->level()) > 0); assert(compact->builder == nullptr); assert(!compact->outfile); @@ -2553,9 +2582,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, CompactionStats stats; stats.micros = env_->NowMicros() - start_micros - imm_micros; - if (options_.statistics.get()) { - options_.statistics.get()->measureTime(COMPACTION_TIME, stats.micros); - } + MeasureTime(options_.statistics.get(), COMPACTION_TIME, stats.micros); stats.files_in_leveln = compact->compaction->num_input_files(0); stats.files_in_levelnp1 = compact->compaction->num_input_files(1); @@ -2597,22 +2624,21 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, status = InstallCompactionResults(compact); InstallSuperVersion(deletion_state); } - VersionSet::LevelSummaryStorage tmp; + Version::LevelSummaryStorage tmp; Log(options_.info_log, "compacted to: %s, %.1f MB/sec, level %d, files in(%d, %d) out(%d) " "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " "write-amplify(%.1f) %s\n", - versions_->LevelSummary(&tmp), + 
versions_->current()->LevelSummary(&tmp), (stats.bytes_readn + stats.bytes_readnp1 + stats.bytes_written) / - (double) stats.micros, - compact->compaction->output_level(), - stats.files_in_leveln, stats.files_in_levelnp1, stats.files_out_levelnp1, - stats.bytes_readn / 1048576.0, - stats.bytes_readnp1 / 1048576.0, + (double)stats.micros, + compact->compaction->output_level(), stats.files_in_leveln, + stats.files_in_levelnp1, stats.files_out_levelnp1, + stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0, stats.bytes_written / 1048576.0, (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) / - (double) stats.bytes_readn, - stats.bytes_written / (double) stats.bytes_readn, + (double)stats.bytes_readn, + stats.bytes_written / (double)stats.bytes_readn, status.ToString().c_str()); return status; @@ -2649,38 +2675,40 @@ static void CleanupIteratorState(void* arg1, void* arg2) { Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, SequenceNumber* latest_snapshot) { IterState* cleanup = new IterState; - mutex_.Lock(); - *latest_snapshot = versions_->LastSequence(); + MemTable* mutable_mem; + std::vector immutables; + Version* version; // Collect together all needed child iterators for mem - std::vector list; + mutex_.Lock(); + *latest_snapshot = versions_->LastSequence(); mem_->Ref(); - list.push_back(mem_->NewIterator(options)); - - cleanup->mem.push_back(mem_); - + mutable_mem = mem_; // Collect together all needed child iterators for imm_ - std::vector immutables; imm_.GetMemTables(&immutables); for (unsigned int i = 0; i < immutables.size(); i++) { - MemTable* m = immutables[i]; - m->Ref(); + immutables[i]->Ref(); + } + // Collect iterators for files in L0 - Ln + versions_->current()->Ref(); + version = versions_->current(); + mutex_.Unlock(); + + std::vector list; + list.push_back(mutable_mem->NewIterator(options)); + cleanup->mem.push_back(mutable_mem); + for (MemTable* m : immutables) { list.push_back(m->NewIterator(options)); 
cleanup->mem.push_back(m); } - - // Collect iterators for files in L0 - Ln - versions_->current()->AddIterators(options, storage_options_, &list); + version->AddIterators(options, storage_options_, &list); Iterator* internal_iter = NewMergingIterator(&internal_comparator_, &list[0], list.size()); - versions_->current()->Ref(); - + cleanup->version = version; cleanup->mu = &mutex_; cleanup->db = this; - cleanup->version = versions_->current(); internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); - mutex_.Unlock(); return internal_iter; } @@ -2691,7 +2719,7 @@ Iterator* DBImpl::TEST_NewInternalIterator() { int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { MutexLock l(&mutex_); - return versions_->MaxNextLevelOverlappingBytes(); + return versions_->current()->MaxNextLevelOverlappingBytes(); } Status DBImpl::Get(const ReadOptions& options, @@ -2898,7 +2926,7 @@ std::vector DBImpl::MultiGet( Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options, const std::string& column_family_name, ColumnFamilyHandle* handle) { - VersionEdit edit(0); + VersionEdit edit; edit.AddColumnFamily(column_family_name); MutexLock l(&mutex_); ++versions_->max_column_family_; @@ -2920,7 +2948,7 @@ Status DBImpl::DropColumnFamily(const ColumnFamilyHandle& column_family) { if (column_family.id == 0) { return Status::InvalidArgument("Can't drop default column family"); } - VersionEdit edit(0); + VersionEdit edit; edit.DropColumnFamily(); edit.SetColumnFamily(column_family.id); MutexLock l(&mutex_); @@ -3045,12 +3073,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { uint64_t last_sequence = versions_->LastSequence(); Writer* last_writer = &w; if (status.ok() && my_batch != nullptr) { // nullptr batch is for compactions - // TODO: BuildBatchGroup physically concatenate/copy all write batches into - // a new one. Mem copy is done with the lock held. 
Ideally, we only need - // the lock to obtain the last_writer and the references to all batches. - // Creation (copy) of the merged batch could have been done outside of the - // lock protected region. - WriteBatch* updates = BuildBatchGroup(&last_writer); + autovector write_batch_group; + BuildBatchGroup(&last_writer, &write_batch_group); // Add to log and apply to memtable. We can release the lock // during this phase since &w is currently responsible for logging @@ -3058,6 +3082,16 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // into mem_. { mutex_.Unlock(); + WriteBatch* updates = nullptr; + if (write_batch_group.size() == 1) { + updates = write_batch_group[0]; + } else { + updates = &tmp_batch_; + for (size_t i = 0; i < write_batch_group.size(); ++i) { + WriteBatchInternal::Append(updates, write_batch_group[i]); + } + } + const SequenceNumber current_sequence = last_sequence + 1; WriteBatchInternal::SetSequence(updates, current_sequence); int my_batch_count = WriteBatchInternal::Count(updates); @@ -3100,15 +3134,15 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // have succeeded in memtable but Status reports error for all writes. 
throw std::runtime_error("In memory WriteBatch corruption!"); } - SetTickerCount(options_.statistics.get(), - SEQUENCE_NUMBER, last_sequence); + SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER, + last_sequence); } + if (updates == &tmp_batch_) tmp_batch_.Clear(); mutex_.Lock(); if (status.ok()) { versions_->SetLastSequence(last_sequence); } } - if (updates == &tmp_batch_) tmp_batch_.Clear(); } if (options_.paranoid_checks && !status.ok() && bg_error_.ok()) { bg_error_ = status; // stop compaction & fail any further writes @@ -3136,13 +3170,14 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // REQUIRES: Writer list must be non-empty // REQUIRES: First writer must have a non-nullptr batch -WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) { +void DBImpl::BuildBatchGroup(Writer** last_writer, + autovector* write_batch_group) { assert(!writers_.empty()); Writer* first = writers_.front(); - WriteBatch* result = first->batch; - assert(result != nullptr); + assert(first->batch != nullptr); size_t size = WriteBatchInternal::ByteSize(first->batch); + write_batch_group->push_back(first->batch); // Allow the group to grow up to a maximum size, but if the // original write is small, limit the growth so we do not slow @@ -3175,18 +3210,10 @@ WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) { break; } - // Append to *reuslt - if (result == first->batch) { - // Switch to temporary batch instead of disturbing caller's batch - result = &tmp_batch_; - assert(WriteBatchInternal::Count(result) == 0); - WriteBatchInternal::Append(result, first->batch); - } - WriteBatchInternal::Append(result, w->batch); + write_batch_group->push_back(w->batch); } *last_writer = w; } - return result; } // This function computes the amount of time in microseconds by which a write @@ -3200,7 +3227,7 @@ WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) { // The goal of this formula is to gradually increase the rate at which writes // are 
slowed. We also tried linear delay (r * 1000), but it seemed to do // slightly worse. There is no other particular reason for choosing quadratic. -uint64_t DBImpl::SlowdownAmount(int n, int top, int bottom) { +uint64_t DBImpl::SlowdownAmount(int n, double bottom, double top) { uint64_t delay; if (n >= top) { delay = 1000; @@ -3212,10 +3239,10 @@ uint64_t DBImpl::SlowdownAmount(int n, int top, int bottom) { // If we are here, we know that: // level0_start_slowdown <= n < level0_slowdown // since the previous two conditions are false. - float how_much = - (float) (n - bottom) / + double how_much = + (double) (n - bottom) / (top - bottom); - delay = how_much * how_much * 1000; + delay = std::max(how_much * how_much * 1000, 100.0); } assert(delay <= 1000); return delay; @@ -3240,25 +3267,22 @@ Status DBImpl::MakeRoomForWrite(bool force, // Yield previous error s = bg_error_; break; - } else if ( - allow_delay && - versions_->NumLevelFiles(0) >= - options_.level0_slowdown_writes_trigger) { + } else if (allow_delay && versions_->NeedSlowdownForNumLevel0Files()) { // We are getting close to hitting a hard limit on the number of // L0 files. Rather than delaying a single write by several // seconds when we hit the hard limit, start delaying each // individual write by 0-1ms to reduce latency variance. Also, // this delay hands over some CPU to the compaction thread in // case it is sharing the same core as the writer. 
+ uint64_t slowdown = + SlowdownAmount(versions_->current()->NumLevelFiles(0), + options_.level0_slowdown_writes_trigger, + options_.level0_stop_writes_trigger); mutex_.Unlock(); uint64_t delayed; { StopWatch sw(env_, options_.statistics.get(), STALL_L0_SLOWDOWN_COUNT); - env_->SleepForMicroseconds( - SlowdownAmount(versions_->NumLevelFiles(0), - options_.level0_slowdown_writes_trigger, - options_.level0_stop_writes_trigger) - ); + env_->SleepForMicroseconds(slowdown); delayed = sw.ElapsedMicros(); } RecordTick(options_.statistics.get(), STALL_L0_SLOWDOWN_MICROS, delayed); @@ -3290,7 +3314,7 @@ Status DBImpl::MakeRoomForWrite(bool force, STALL_MEMTABLE_COMPACTION_MICROS, stall); stall_memtable_compaction_ += stall; stall_memtable_compaction_count_++; - } else if (versions_->NumLevelFiles(0) >= + } else if (versions_->current()->NumLevelFiles(0) >= options_.level0_stop_writes_trigger) { // There are too many level-0 files. DelayLoggingAndReset(); @@ -3366,17 +3390,13 @@ Status DBImpl::MakeRoomForWrite(bool force, EnvOptions soptions(storage_options_); soptions.use_mmap_writes = false; DelayLoggingAndReset(); - s = env_->NewWritableFile( - LogFileName(options_.wal_dir, new_log_number), - &lfile, - soptions - ); + s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number), + &lfile, soptions); if (s.ok()) { // Our final size should be less than write_buffer_size // (compression, etc) but err on the side of caution. 
lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size); - memtmp = new MemTable( - internal_comparator_, mem_rep_factory_, NumberLevels(), options_); + memtmp = new MemTable(internal_comparator_, options_); new_superversion = new SuperVersion(options_.max_write_buffer_number); } } @@ -3426,6 +3446,7 @@ bool DBImpl::GetProperty(const ColumnFamilyHandle& column_family, value->clear(); MutexLock l(&mutex_); + Version* current = versions_->current(); Slice in = property; Slice prefix("rocksdb."); if (!in.starts_with(prefix)) return false; @@ -3440,7 +3461,7 @@ bool DBImpl::GetProperty(const ColumnFamilyHandle& column_family, } else { char buf[100]; snprintf(buf, sizeof(buf), "%d", - versions_->NumLevelFiles(static_cast(level))); + current->NumLevelFiles(static_cast(level))); *value = buf; return true; } @@ -3455,8 +3476,8 @@ bool DBImpl::GetProperty(const ColumnFamilyHandle& column_family, snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level, - versions_->NumLevelFiles(level), - versions_->NumLevelBytes(level) / 1048576.0); + current->NumLevelFiles(level), + current->NumLevelBytes(level) / 1048576.0); value->append(buf); } return true; @@ -3499,8 +3520,8 @@ bool DBImpl::GetProperty(const ColumnFamilyHandle& column_family, "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n" ); value->append(buf); - for (int level = 0; level < NumberLevels(); level++) { - int files = versions_->NumLevelFiles(level); + for (int level = 0; level < current->NumberLevels(); level++) { + int files = current->NumLevelFiles(level); if (stats_[level].micros > 0 || files > 0) { int64_t bytes_read = stats_[level].bytes_readn + stats_[level].bytes_readnp1; @@ -3521,8 +3542,8 @@ bool DBImpl::GetProperty(const ColumnFamilyHandle& column_family, "%3d %8d %8.0f %5.1f %9.0f %9.0f %9.0f %9.0f %9.0f %9.0f %10.1f %9.1f %11.1f %8d %8d %8d %8d %8d %9.1f %9lu\n", 
level, files, - versions_->NumLevelBytes(level) / 1048576.0, - versions_->NumLevelBytes(level) / + current->NumLevelBytes(level) / 1048576.0, + current->NumLevelBytes(level) / versions_->MaxBytesForLevel(level), stats_[level].micros / 1e6, bytes_read / 1048576.0, @@ -3758,7 +3779,7 @@ Status DBImpl::DeleteFile(std::string name) { int level; FileMetaData metadata; int maxlevel = NumberLevels(); - VersionEdit edit(maxlevel); + VersionEdit edit; DeletionState deletion_state(0, true); { MutexLock l(&mutex_); @@ -3781,7 +3802,7 @@ Status DBImpl::DeleteFile(std::string name) { // This is to make sure that any deletion tombstones are not // lost. Check that the level passed is the last level. for (int i = level + 1; i < maxlevel; i++) { - if (versions_->NumLevelFiles(i) != 0) { + if (versions_->current()->NumLevelFiles(i) != 0) { Log(options_.info_log, "DeleteFile %s FAILED. File not in last level\n", name.c_str()); return Status::InvalidArgument("File not in last level"); @@ -3836,7 +3857,10 @@ Status DBImpl::GetDbIdentity(std::string& identity) { // can call if they wish Status DB::Put(const WriteOptions& opt, const ColumnFamilyHandle& column_family, const Slice& key, const Slice& value) { - WriteBatch batch; + // Pre-allocate size of write batch conservatively. + // 8 bytes are taken by header, 4 bytes for count, 1 byte for type, + // and we allocate 11 extra bytes for key length, as well as value length. 
+ WriteBatch batch(key.size() + value.size() + 24); batch.Put(column_family.id, key, value); return Write(opt, &batch); } @@ -3915,20 +3939,20 @@ Status DB::OpenWithColumnFamilies( return s; } impl->mutex_.Lock(); - VersionEdit edit(impl->NumberLevels()); + VersionEdit edit; // Handles create_if_missing, error_if_exists s = impl->Recover(&edit, column_families); if (s.ok()) { uint64_t new_log_number = impl->versions_->NewFileNumber(); unique_ptr lfile; soptions.use_mmap_writes = false; - s = options.env->NewWritableFile( + s = impl->options_.env->NewWritableFile( LogFileName(impl->options_.wal_dir, new_log_number), &lfile, soptions ); if (s.ok()) { - lfile->SetPreallocationBlockSize(1.1 * options.write_buffer_size); + lfile->SetPreallocationBlockSize(1.1 * impl->options_.write_buffer_size); edit.SetLogNumber(new_log_number); impl->logfile_number_ = new_log_number; impl->log_.reset(new log::Writer(std::move(lfile))); @@ -3949,12 +3973,11 @@ Status DB::OpenWithColumnFamilies( impl->MaybeScheduleLogDBDeployStats(); } } - impl->mutex_.Unlock(); - if (s.ok() && options.compaction_style == kCompactionStyleUniversal) { - int num_files; + if (s.ok() && impl->options_.compaction_style == kCompactionStyleUniversal) { + Version* current = impl->versions_->current(); for (int i = 1; i < impl->NumberLevels(); i++) { - num_files = impl->versions_->NumLevelFiles(i); + int num_files = current->NumLevelFiles(i); if (num_files > 0) { s = Status::InvalidArgument("Not all files are at level 0. 
Cannot " "open with universal compaction style."); @@ -3963,6 +3986,8 @@ Status DB::OpenWithColumnFamilies( } } + impl->mutex_.Unlock(); + if (s.ok()) { *dbptr = impl; } else { diff --git a/db/db_impl.h b/db/db_impl.h index 9baea728f..9146df7bd 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -22,6 +22,7 @@ #include "port/port.h" #include "util/stats_logger.h" #include "memtablelist.h" +#include "util/autovector.h" namespace rocksdb { @@ -125,10 +126,17 @@ class DBImpl : public DB { virtual Status GetDbIdentity(std::string& identity); + void RunManualCompaction(int input_level, + int output_level, + const Slice* begin, + const Slice* end); + // Extra methods (for testing) that are not in the public DB interface // Compact any files in the named level that overlap [*begin, *end] - void TEST_CompactRange(int level, const Slice* begin, const Slice* end); + void TEST_CompactRange(int level, + const Slice* begin, + const Slice* end); // Force current memtable contents to be flushed. Status TEST_FlushMemTable(); @@ -158,7 +166,7 @@ class DBImpl : public DB { void TEST_PurgeObsoleteteWAL(); // get total level0 file size. Only for testing. - uint64_t TEST_GetLevel0TotalSize() { return versions_->NumLevelBytes(0);} + uint64_t TEST_GetLevel0TotalSize(); void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL) { @@ -324,13 +332,14 @@ class DBImpl : public DB { Status WriteLevel0Table(std::vector &mems, VersionEdit* edit, uint64_t* filenumber); - uint64_t SlowdownAmount(int n, int top, int bottom); + uint64_t SlowdownAmount(int n, double bottom, double top); // MakeRoomForWrite will return superversion_to_free through an arugment, // which the caller needs to delete. We do it because caller can delete // the superversion outside of mutex Status MakeRoomForWrite(bool force /* compact even if there is room? 
*/, SuperVersion** superversion_to_free); - WriteBatch* BuildBatchGroup(Writer** last_writer); + void BuildBatchGroup(Writer** last_writer, + autovector* write_batch_group); // Force current memtable contents to be flushed. Status FlushMemTable(const FlushOptions& options); @@ -443,7 +452,8 @@ class DBImpl : public DB { // Information for a manual compaction struct ManualCompaction { - int level; + int input_level; + int output_level; bool done; bool in_progress; // compaction request being processed? const InternalKey* begin; // nullptr means beginning of key range diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index dee484951..ad3395778 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -85,7 +85,7 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, DBImplReadOnly* impl = new DBImplReadOnly(options, dbname); impl->mutex_.Lock(); - VersionEdit edit(impl->NumberLevels()); + VersionEdit edit; DBOptions db_options(options); ColumnFamilyOptions cf_options(options); std::vector column_families; diff --git a/db/db_statistics.cc b/db/db_statistics.cc deleted file mode 100644 index f0cfd6740..000000000 --- a/db/db_statistics.cc +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#include "db/db_statistics.h" - -namespace rocksdb { - -std::shared_ptr CreateDBStatistics() { - return std::make_shared(); -} - -} // namespace rocksdb diff --git a/db/db_statistics.h b/db/db_statistics.h deleted file mode 100644 index ec71e1688..000000000 --- a/db/db_statistics.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
-// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#pragma once -#include -#include -#include -#include - -#include "rocksdb/statistics.h" -#include "util/histogram.h" -#include "port/port.h" -#include "util/mutexlock.h" - - -namespace rocksdb { - -class DBStatistics: public Statistics { - public: - DBStatistics() : allTickers_(TICKER_ENUM_MAX), - allHistograms_(HISTOGRAM_ENUM_MAX) { } - - virtual ~DBStatistics() {} - - virtual long getTickerCount(Tickers tickerType) { - assert(tickerType < TICKER_ENUM_MAX); - return allTickers_[tickerType].getCount(); - } - - virtual void setTickerCount(Tickers tickerType, uint64_t count) { - assert(tickerType < TICKER_ENUM_MAX); - allTickers_[tickerType].setTickerCount(count); - } - - virtual void recordTick(Tickers tickerType, uint64_t count) { - assert(tickerType < TICKER_ENUM_MAX); - allTickers_[tickerType].recordTick(count); - } - - virtual void measureTime(Histograms histogramType, uint64_t value) { - assert(histogramType < HISTOGRAM_ENUM_MAX); - allHistograms_[histogramType].Add(value); - } - - virtual void histogramData(Histograms histogramType, - HistogramData * const data) { - assert(histogramType < HISTOGRAM_ENUM_MAX); - allHistograms_[histogramType].Data(data); - } - - std::vector allTickers_; - std::vector allHistograms_; -}; - -std::shared_ptr CreateDBStatistics(); - -} // namespace rocksdb diff --git a/db/db_stats_logger.cc b/db/db_stats_logger.cc index 91810abe3..db86865ca 100644 --- a/db/db_stats_logger.cc +++ b/db/db_stats_logger.cc @@ -65,13 +65,14 @@ void DBImpl::LogDBDeployStats() { uint64_t file_total_size = 0; 
uint32_t file_total_num = 0; - for (int i = 0; i < versions_->NumberLevels(); i++) { - file_total_num += versions_->NumLevelFiles(i); - file_total_size += versions_->NumLevelBytes(i); + Version* current = versions_->current(); + for (int i = 0; i < current->NumberLevels(); i++) { + file_total_num += current->NumLevelFiles(i); + file_total_size += current->NumLevelBytes(i); } - VersionSet::LevelSummaryStorage scratch; - const char* file_num_summary = versions_->LevelSummary(&scratch); + Version::LevelSummaryStorage scratch; + const char* file_num_summary = current->LevelSummary(&scratch); std::string file_num_per_level(file_num_summary); std::string data_size_per_level(file_num_summary); diff --git a/db/db_test.cc b/db/db_test.cc index 3659e8d84..44ce16d60 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -17,7 +17,6 @@ #include "db/filename.h" #include "db/version_set.h" #include "db/write_batch_internal.h" -#include "db/db_statistics.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" @@ -27,6 +26,7 @@ #include "util/mutexlock.h" #include "util/testharness.h" #include "util/testutil.h" +#include "util/statistics.h" #include "utilities/merge_operators.h" namespace rocksdb { @@ -680,6 +680,10 @@ static std::string Key(int i) { return std::string(buf); } +static long TestGetTickerCount(const Options& options, Tickers ticker_type) { + return options.statistics->getTickerCount(ticker_type); +} + TEST(DBTest, Empty) { do { ASSERT_TRUE(db_ != nullptr); @@ -713,14 +717,11 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { dbfull()->Flush(FlushOptions()); // index/filter blocks added to block cache right after table creation. 
- ASSERT_EQ(1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(2, /* only index/filter were added */ - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); - ASSERT_EQ(0, - options.statistics.get()->getTickerCount(BLOCK_CACHE_DATA_MISS)); + TestGetTickerCount(options, BLOCK_CACHE_ADD)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS)); // Make sure filter block is in cache. std::string value; @@ -728,31 +729,24 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { db_->KeyMayExist(ReadOptions(), "key", &value); // Miss count should remain the same. - ASSERT_EQ(1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS)); - ASSERT_EQ(1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); db_->KeyMayExist(ReadOptions(), "key", &value); - ASSERT_EQ(1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS)); - ASSERT_EQ(2, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); // Make sure index block is in cache. 
- auto index_block_hit = - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT); + auto index_block_hit = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); value = Get("key"); - ASSERT_EQ(1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(index_block_hit + 1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT)); + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); value = Get("key"); - ASSERT_EQ(1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(index_block_hit + 2, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT)); + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); } TEST(DBTest, LevelLimitReopen) { @@ -768,10 +762,9 @@ TEST(DBTest, LevelLimitReopen) { options.num_levels = 1; options.max_bytes_for_level_multiplier_additional.resize(1, 1); Status s = TryReopen(&options); - ASSERT_EQ(s.IsCorruption(), true); + ASSERT_EQ(s.IsInvalidArgument(), true); ASSERT_EQ(s.ToString(), - "Corruption: VersionEdit: column family already has " - "more levels than specified"); + "Invalid argument: db has more levels than options.num_levels"); options.num_levels = 10; options.max_bytes_for_level_multiplier_additional.resize(10, 1); @@ -968,47 +961,39 @@ TEST(DBTest, KeyMayExist) { dbfull()->Flush(FlushOptions()); value.clear(); - long numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); - long cache_added = - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + long numopen = TestGetTickerCount(options, NO_FILE_OPENS); + long cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); ASSERT_TRUE(db_->KeyMayExist(ropts, "a", &value, &value_found)); ASSERT_TRUE(!value_found); // assert that no new files were opened and no new blocks were // read into block cache. 
- ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); - ASSERT_EQ(cache_added, - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); ASSERT_OK(db_->Delete(WriteOptions(), "a")); - numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); - cache_added = - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); - ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); - ASSERT_EQ(cache_added, - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); dbfull()->Flush(FlushOptions()); dbfull()->CompactRange(nullptr, nullptr); - numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); - cache_added = - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); - ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); - ASSERT_EQ(cache_added, - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); ASSERT_OK(db_->Delete(WriteOptions(), "c")); - numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); - cache_added = - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); 
ASSERT_TRUE(!db_->KeyMayExist(ropts, "c", &value)); - ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); - ASSERT_EQ(cache_added, - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); delete options.filter_policy; } while (ChangeOptions()); @@ -1041,9 +1026,8 @@ TEST(DBTest, NonBlockingIteration) { // verify that a non-blocking iterator does not find any // kvs. Neither does it do any IOs to storage. - long numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); - long cache_added = - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + long numopen = TestGetTickerCount(options, NO_FILE_OPENS); + long cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); iter = db_->NewIterator(non_blocking_opts); count = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { @@ -1051,18 +1035,16 @@ TEST(DBTest, NonBlockingIteration) { } ASSERT_EQ(count, 0); ASSERT_TRUE(iter->status().IsIncomplete()); - ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); - ASSERT_EQ(cache_added, - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); delete iter; // read in the specified block via a regular get ASSERT_EQ(Get("a"), "b"); // verify that we can find it via a non-blocking scan - numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); - cache_added = - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); iter = db_->NewIterator(non_blocking_opts); count = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { @@ -1070,9 +1052,8 @@ TEST(DBTest, NonBlockingIteration) { count++; } 
ASSERT_EQ(count, 1); - ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); - ASSERT_EQ(cache_added, - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); delete iter; } while (ChangeOptions()); @@ -1277,12 +1258,10 @@ TEST(DBTest, IterReseek) { ASSERT_OK(Put("b", "bone")); Iterator* iter = db_->NewIterator(ReadOptions()); iter->SeekToFirst(); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); ASSERT_EQ(IterStatus(iter), "a->two"); iter->Next(); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); ASSERT_EQ(IterStatus(iter), "b->bone"); delete iter; @@ -1293,8 +1272,7 @@ TEST(DBTest, IterReseek) { iter->SeekToFirst(); ASSERT_EQ(IterStatus(iter), "a->three"); iter->Next(); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); ASSERT_EQ(IterStatus(iter), "b->bone"); delete iter; @@ -1304,30 +1282,28 @@ TEST(DBTest, IterReseek) { iter = db_->NewIterator(ReadOptions()); iter->SeekToFirst(); ASSERT_EQ(IterStatus(iter), "a->four"); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); iter->Next(); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); ASSERT_EQ(IterStatus(iter), "b->bone"); delete iter; // Testing reverse iterator // At this point, we have three versions of "a" and one version of "b". 
// The reseek statistics is already at 1. - int num_reseeks = (int)options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION); + int num_reseeks = + (int)TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION); // Insert another version of b and assert that reseek is not invoked ASSERT_OK(Put("b", "btwo")); iter = db_->NewIterator(ReadOptions()); iter->SeekToLast(); ASSERT_EQ(IterStatus(iter), "b->btwo"); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks); iter->Prev(); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks+1); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 1); ASSERT_EQ(IterStatus(iter), "a->four"); delete iter; @@ -1338,13 +1314,13 @@ TEST(DBTest, IterReseek) { iter = db_->NewIterator(ReadOptions()); iter->SeekToLast(); ASSERT_EQ(IterStatus(iter), "b->bfour"); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks + 2); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 2); iter->Prev(); // the previous Prev call should have invoked reseek - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks + 3); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 3); ASSERT_EQ(IterStatus(iter), "a->four"); delete iter; } @@ -2107,24 +2083,18 @@ TEST(DBTest, CompressedCache) { switch (iter) { case 0: // only uncompressed block cache - ASSERT_GT(options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS), - 0); - ASSERT_EQ(options.statistics.get()->getTickerCount - (BLOCK_CACHE_COMPRESSED_MISS), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); break; case 1: // no 
block cache, only compressed cache - ASSERT_EQ(options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS), - 0); - ASSERT_GT(options.statistics.get()->getTickerCount - (BLOCK_CACHE_COMPRESSED_MISS), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); break; case 2: // both compressed and uncompressed block cache - ASSERT_GT(options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS), - 0); - ASSERT_GT(options.statistics.get()->getTickerCount - (BLOCK_CACHE_COMPRESSED_MISS), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); break; default: ASSERT_TRUE(false); @@ -3313,34 +3283,46 @@ TEST(DBTest, ManualCompaction) { ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2) << "Need to update this test to match kMaxMemCompactLevel"; - MakeTables(3, "p", "q"); - ASSERT_EQ("1,1,1", FilesPerLevel()); + // iter - 0 with 7 levels + // iter - 1 with 3 levels + for (int iter = 0; iter < 2; ++iter) { + MakeTables(3, "p", "q"); + ASSERT_EQ("1,1,1", FilesPerLevel()); + + // Compaction range falls before files + Compact("", "c"); + ASSERT_EQ("1,1,1", FilesPerLevel()); - // Compaction range falls before files - Compact("", "c"); - ASSERT_EQ("1,1,1", FilesPerLevel()); + // Compaction range falls after files + Compact("r", "z"); + ASSERT_EQ("1,1,1", FilesPerLevel()); - // Compaction range falls after files - Compact("r", "z"); - ASSERT_EQ("1,1,1", FilesPerLevel()); + // Compaction range overlaps files + Compact("p1", "p9"); + ASSERT_EQ("0,0,1", FilesPerLevel()); - // Compaction range overlaps files - Compact("p1", "p9"); - ASSERT_EQ("0,0,1", FilesPerLevel()); + // Populate a different range + MakeTables(3, "c", "e"); + ASSERT_EQ("1,1,2", FilesPerLevel()); - // Populate a different range - MakeTables(3, "c", "e"); - ASSERT_EQ("1,1,2", FilesPerLevel()); + // Compact just the new range + Compact("b", "f"); + 
ASSERT_EQ("0,0,2", FilesPerLevel()); - // Compact just the new range - Compact("b", "f"); - ASSERT_EQ("0,0,2", FilesPerLevel()); + // Compact all + MakeTables(1, "a", "z"); + ASSERT_EQ("0,1,2", FilesPerLevel()); + db_->CompactRange(nullptr, nullptr); + ASSERT_EQ("0,0,1", FilesPerLevel()); + + if (iter == 0) { + Options options = CurrentOptions(); + options.num_levels = 3; + options.create_if_missing = true; + DestroyAndReopen(&options); + } + } - // Compact all - MakeTables(1, "a", "z"); - ASSERT_EQ("0,1,2", FilesPerLevel()); - db_->CompactRange(nullptr, nullptr); - ASSERT_EQ("0,0,1", FilesPerLevel()); } TEST(DBTest, DBOpen_Options) { @@ -3401,7 +3383,7 @@ TEST(DBTest, DBOpen_Change_NumLevels) { opts.create_if_missing = false; opts.num_levels = 2; s = DB::Open(opts, dbname, &db); - ASSERT_TRUE(strstr(s.ToString().c_str(), "Corruption") != nullptr); + ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr); ASSERT_TRUE(db == nullptr); } @@ -4336,6 +4318,70 @@ TEST(DBTest, MultiThreaded) { } while (ChangeOptions()); } +// Group commit test: +namespace { + +static const int kGCNumThreads = 4; +static const int kGCNumKeys = 1000; + +struct GCThread { + DB* db; + int id; + std::atomic done; +}; + +static void GCThreadBody(void* arg) { + GCThread* t = reinterpret_cast(arg); + int id = t->id; + DB* db = t->db; + WriteOptions wo; + + for (int i = 0; i < kGCNumKeys; ++i) { + std::string kv(std::to_string(i + id * kGCNumKeys)); + ASSERT_OK(db->Put(wo, kv, kv)); + } + t->done = true; +} + +} // namespace + +TEST(DBTest, GroupCommitTest) { + do { + // Start threads + GCThread thread[kGCNumThreads]; + for (int id = 0; id < kGCNumThreads; id++) { + thread[id].id = id; + thread[id].db = db_; + thread[id].done = false; + env_->StartThread(GCThreadBody, &thread[id]); + } + + for (int id = 0; id < kGCNumThreads; id++) { + while (thread[id].done == false) { + env_->SleepForMicroseconds(100000); + } + } + + std::vector expected_db; + for (int i = 0; i < kGCNumThreads 
* kGCNumKeys; ++i) { + expected_db.push_back(std::to_string(i)); + } + sort(expected_db.begin(), expected_db.end()); + + Iterator* itr = db_->NewIterator(ReadOptions()); + itr->SeekToFirst(); + for (auto x : expected_db) { + ASSERT_TRUE(itr->Valid()); + ASSERT_EQ(itr->key().ToString(), x); + ASSERT_EQ(itr->value().ToString(), x); + itr->Next(); + } + ASSERT_TRUE(!itr->Valid()); + delete itr; + + } while (ChangeOptions()); +} + namespace { typedef std::map KVMap; } @@ -4903,7 +4949,7 @@ void BM_LogAndApply(int iters, int num_base_files) { EnvOptions sopt; VersionSet vset(dbname, &options, sopt, nullptr, &cmp); ASSERT_OK(vset.Recover()); - VersionEdit vbase(vset.NumberLevels()); + VersionEdit vbase; uint64_t fnum = 1; for (int i = 0; i < num_base_files; i++) { InternalKey start(MakeKey(2*fnum), 1, kTypeValue); @@ -4915,7 +4961,7 @@ void BM_LogAndApply(int iters, int num_base_files) { uint64_t start_micros = env->NowMicros(); for (int i = 0; i < iters; i++) { - VersionEdit vedit(vset.NumberLevels()); + VersionEdit vedit; vedit.DeleteFile(2, fnum); InternalKey start(MakeKey(2*fnum), 1, kTypeValue); InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); diff --git a/db/memtable.cc b/db/memtable.cc index 796ba1b3a..bf2dfa64b 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -20,7 +20,7 @@ #include "util/coding.h" #include "util/mutexlock.h" #include "util/murmurhash.h" -#include "util/statistics_imp.h" +#include "util/statistics.h" namespace std { template <> @@ -33,24 +33,20 @@ struct hash { namespace rocksdb { -MemTable::MemTable(const InternalKeyComparator& cmp, - MemTableRepFactory* table_factory, - int numlevel, - const Options& options) +MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options) : comparator_(cmp), refs_(0), arena_impl_(options.arena_block_size), - table_(table_factory->CreateMemTableRep(comparator_, &arena_impl_)), + table_(options.memtable_factory->CreateMemTableRep(comparator_, + &arena_impl_)), flush_in_progress_(false), 
flush_completed_(false), file_number_(0), - edit_(numlevel), first_seqno_(0), mem_next_logfile_number_(0), mem_logfile_number_(0), - locks_(options.inplace_update_support - ? options.inplace_update_num_locks - : 0) { } + locks_(options.inplace_update_support ? options.inplace_update_num_locks + : 0) {} MemTable::~MemTable() { assert(refs_ == 0); @@ -58,7 +54,7 @@ MemTable::~MemTable() { size_t MemTable::ApproximateMemoryUsage() { return arena_impl_.ApproximateMemoryUsage() + - table_->ApproximateMemoryUsage(); + table_->ApproximateMemoryUsage(); } int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr) @@ -89,11 +85,11 @@ class MemTableIterator: public Iterator { MemTableIterator(MemTableRep* table, const ReadOptions& options) : iter_() { if (options.prefix) { - iter_ = table->GetPrefixIterator(*options.prefix); + iter_.reset(table->GetPrefixIterator(*options.prefix)); } else if (options.prefix_seek) { - iter_ = table->GetDynamicPrefixIterator(); + iter_.reset(table->GetDynamicPrefixIterator()); } else { - iter_ = table->GetIterator(); + iter_.reset(table->GetIterator()); } } @@ -114,7 +110,7 @@ class MemTableIterator: public Iterator { virtual Status status() const { return Status::OK(); } private: - std::shared_ptr iter_; + std::unique_ptr iter_; std::string tmp_; // For passing to EncodeKey // No copying allowed @@ -165,8 +161,8 @@ void MemTable::Add(SequenceNumber s, ValueType type, bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, MergeContext& merge_context, const Options& options) { Slice memkey = key.memtable_key(); - std::shared_ptr iter( - table_->GetIterator(key.user_key())); + std::unique_ptr iter( + table_->GetIterator(key.user_key())); iter->Seek(memkey.data()); bool merge_in_progress = s->IsMergeInProgress(); @@ -274,8 +270,8 @@ bool MemTable::Update(SequenceNumber seq, ValueType type, LookupKey lkey(key, seq); Slice memkey = lkey.memtable_key(); - std::shared_ptr iter( - 
table_->GetIterator(lkey.user_key())); + std::unique_ptr iter( + table_->GetIterator(lkey.user_key())); iter->Seek(memkey.data()); if (iter->Valid()) { @@ -336,8 +332,8 @@ size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) { // A total ordered iterator is costly for some memtablerep (prefix aware // reps). By passing in the user key, we allow efficient iterator creation. // The iterator only needs to be ordered within the same user key. - std::shared_ptr iter( - table_->GetIterator(key.user_key())); + std::unique_ptr iter( + table_->GetIterator(key.user_key())); iter->Seek(memkey.data()); size_t num_successive_merges = 0; diff --git a/db/memtable.h b/db/memtable.h index 12ccf3d37..1b9005800 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -34,11 +34,8 @@ class MemTable { // MemTables are reference counted. The initial reference count // is zero and the caller must call Ref() at least once. - explicit MemTable( - const InternalKeyComparator& comparator, - MemTableRepFactory* table_factory, - int numlevel = 7, - const Options& options = Options()); + explicit MemTable(const InternalKeyComparator& comparator, + const Options& options = Options()); ~MemTable(); @@ -146,7 +143,7 @@ class MemTable { KeyComparator comparator_; int refs_; ArenaImpl arena_impl_; - shared_ptr table_; + unique_ptr table_; // These are used to manage memtable flushes to storage bool flush_in_progress_; // started the flush diff --git a/db/merge_helper.cc b/db/merge_helper.cc index a7e2df0a3..e3f3adb1f 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -8,7 +8,7 @@ #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/merge_operator.h" -#include "util/statistics_imp.h" +#include "util/statistics.h" #include #include diff --git a/db/repair.cc b/db/repair.cc index 6db90c865..29524233f 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -58,7 +58,7 @@ class Repairer { next_file_number_(1) { // TableCache can be small since we expect each table to be opened 
once. table_cache_ = new TableCache(dbname_, &options_, storage_options_, 10); - edit_ = new VersionEdit(options.num_levels); + edit_ = new VersionEdit(); } ~Repairer() { @@ -196,8 +196,7 @@ class Repairer { std::string scratch; Slice record; WriteBatch batch; - MemTable* mem = new MemTable(icmp_, options_.memtable_factory.get(), - options_.num_levels); + MemTable* mem = new MemTable(icmp_, options_); mem->Ref(); int counter = 0; while (reader.ReadRecord(&record, &scratch)) { diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc index 555d31893..0f3b89d9b 100644 --- a/db/simple_table_db_test.cc +++ b/db/simple_table_db_test.cc @@ -17,7 +17,7 @@ #include "db/filename.h" #include "db/version_set.h" #include "db/write_batch_internal.h" -#include "db/db_statistics.h" +#include "rocksdb/statistics.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" diff --git a/db/version_edit.cc b/db/version_edit.cc index 2fc6fbb65..5de96b887 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -38,6 +38,7 @@ enum Tag { void VersionEdit::Clear() { comparator_.clear(); + max_level_ = 0; log_number_ = 0; prev_log_number_ = 0; last_sequence_ = 0; @@ -77,12 +78,6 @@ void VersionEdit::EncodeTo(std::string* dst) const { PutVarint64(dst, last_sequence_); } - for (size_t i = 0; i < compact_pointers_.size(); i++) { - PutVarint32(dst, kCompactPointer); - PutVarint32(dst, compact_pointers_[i].first); // level - PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode()); - } - for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); iter != deleted_files_.end(); ++iter) { @@ -131,14 +126,13 @@ static bool GetInternalKey(Slice* input, InternalKey* dst) { bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) { uint32_t v; - if (GetVarint32(input, &v) && - (int)v < number_levels_) { + if (GetVarint32(input, &v)) { *level = v; + if (max_level_ < *level) { + max_level_ = *level; + } return true; } else { - 
if ((int)v >= number_levels_) { - *msg = "column family already has more levels than specified"; - } return false; } } @@ -202,7 +196,9 @@ Status VersionEdit::DecodeFrom(const Slice& src) { case kCompactPointer: if (GetLevel(&input, &level, &msg) && GetInternalKey(&input, &key)) { - compact_pointers_.push_back(std::make_pair(level, key)); + // we don't use compact pointers anymore, + // but we should not fail if they are still + // in manifest } else { if (!msg) { msg = "compaction pointer"; @@ -314,12 +310,6 @@ std::string VersionEdit::DebugString(bool hex_key) const { r.append("\n LastSeq: "); AppendNumberTo(&r, last_sequence_); } - for (size_t i = 0; i < compact_pointers_.size(); i++) { - r.append("\n CompactPointer: "); - AppendNumberTo(&r, compact_pointers_[i].first); - r.append(" "); - r.append(compact_pointers_[i].second.DebugString(hex_key)); - } for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); iter != deleted_files_.end(); ++iter) { diff --git a/db/version_edit.h b/db/version_edit.h index d79642e2c..b7dfa6d03 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -34,9 +34,7 @@ struct FileMetaData { class VersionEdit { public: - explicit VersionEdit(int number_levels) : number_levels_(number_levels) { - Clear(); - } + VersionEdit() { Clear(); } ~VersionEdit() { } void Clear(); @@ -61,9 +59,6 @@ class VersionEdit { has_last_sequence_ = true; last_sequence_ = seq; } - void SetCompactPointer(int level, const InternalKey& key) { - compact_pointers_.push_back(std::make_pair(level, key)); - } // Add the specified file at the specified number. 
// REQUIRES: This version has not been saved (see VersionSet::SaveTo) @@ -128,7 +123,7 @@ class VersionEdit { bool GetLevel(Slice* input, int* level, const char** msg); - int number_levels_; + int max_level_; std::string comparator_; uint64_t log_number_; uint64_t prev_log_number_; @@ -140,7 +135,6 @@ class VersionEdit { bool has_next_file_number_; bool has_last_sequence_; - std::vector< std::pair > compact_pointers_; DeletedFileSet deleted_files_; std::vector< std::pair > new_files_; diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 491fabb89..83d7fc9b3 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -15,7 +15,7 @@ namespace rocksdb { static void TestEncodeDecode(const VersionEdit& edit) { std::string encoded, encoded2; edit.EncodeTo(&encoded); - VersionEdit parsed(7); + VersionEdit parsed; Status s = parsed.DecodeFrom(encoded); ASSERT_TRUE(s.ok()) << s.ToString(); parsed.EncodeTo(&encoded2); @@ -27,7 +27,7 @@ class VersionEditTest { }; TEST(VersionEditTest, EncodeDecode) { static const uint64_t kBig = 1ull << 50; - VersionEdit edit(7); + VersionEdit edit; for (int i = 0; i < 4; i++) { TestEncodeDecode(edit); edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, @@ -36,7 +36,6 @@ TEST(VersionEditTest, EncodeDecode) { kBig + 500 + i, kBig + 600 + i); edit.DeleteFile(4, kBig + 700 + i); - edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue)); } edit.SetComparatorName("foo"); @@ -47,7 +46,7 @@ TEST(VersionEditTest, EncodeDecode) { } TEST(VersionEditTest, ColumnFamilyTest) { - VersionEdit edit(7); + VersionEdit edit; edit.SetColumnFamily(2); edit.AddColumnFamily("column_family"); TestEncodeDecode(edit); diff --git a/db/version_set.cc b/db/version_set.cc index ad1169189..f9d04bf37 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -18,6 +18,7 @@ #include "db/memtable.h" #include "db/merge_context.h" #include "db/table_cache.h" +#include "db/compaction.h" #include "rocksdb/env.h" #include 
"rocksdb/merge_operator.h" #include "rocksdb/table.h" @@ -45,7 +46,7 @@ Version::~Version() { next_->prev_ = prev_; // Drop references to files - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < num_levels_; level++) { for (size_t i = 0; i < files_[level].size(); i++) { FileMetaData* f = files_[level][i]; assert(f->refs > 0); @@ -265,7 +266,7 @@ void Version::AddIterators(const ReadOptions& options, // For levels > 0, we can use a concatenating iterator that sequentially // walks through the non-overlapping files in the level, opening them // lazily. - for (int level = 1; level < vset_->NumberLevels(); level++) { + for (int level = 1; level < num_levels_; level++) { if (!files_[level].empty()) { iters->push_back(NewConcatenatingIterator(options, soptions, level)); } @@ -407,16 +408,19 @@ static bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) { } Version::Version(VersionSet* vset, uint64_t version_number) - : vset_(vset), next_(this), prev_(this), refs_(0), - files_(new std::vector[vset->NumberLevels()]), - files_by_size_(vset->NumberLevels()), - next_file_to_compact_by_size_(vset->NumberLevels()), + : vset_(vset), + next_(this), + prev_(this), + refs_(0), + num_levels_(vset->num_levels_), + files_(new std::vector[num_levels_]), + files_by_size_(num_levels_), + next_file_to_compact_by_size_(num_levels_), file_to_compact_(nullptr), file_to_compact_level_(-1), - compaction_score_(vset->NumberLevels()), - compaction_level_(vset->NumberLevels()), - version_number_(version_number) { -} + compaction_score_(num_levels_), + compaction_level_(num_levels_), + version_number_(version_number) {} void Version::Get(const ReadOptions& options, const LookupKey& k, @@ -455,7 +459,7 @@ void Version::Get(const ReadOptions& options, // levels. Therefore we are guaranteed that if we find data // in an smaller level, later levels are irrelevant (unless we // are MergeInProgress). 
- for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < num_levels_; level++) { size_t num_files = files_[level].size(); if (num_files == 0) continue; @@ -589,6 +593,159 @@ bool Version::UpdateStats(const GetStats& stats) { return false; } +void Version::Finalize(std::vector& size_being_compacted) { + // Pre-sort level0 for Get() + if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + std::sort(files_[0].begin(), files_[0].end(), NewestFirstBySeqNo); + } else { + std::sort(files_[0].begin(), files_[0].end(), NewestFirst); + } + + double max_score = 0; + int max_score_level = 0; + + int num_levels_to_check = + (vset_->options_->compaction_style != kCompactionStyleUniversal) + ? NumberLevels() - 1 + : 1; + + for (int level = 0; level < num_levels_to_check; level++) { + double score; + if (level == 0) { + // We treat level-0 specially by bounding the number of files + // instead of number of bytes for two reasons: + // + // (1) With larger write-buffer sizes, it is nice not to do too + // many level-0 compactions. + // + // (2) The files in level-0 are merged on every read and + // therefore we wish to avoid too many files when the individual + // file size is small (perhaps because of a small write-buffer + // setting, or very high compression ratios, or lots of + // overwrites/deletions). 
+ int numfiles = 0; + for (unsigned int i = 0; i < files_[level].size(); i++) { + if (!files_[level][i]->being_compacted) { + numfiles++; + } + } + + // If we are slowing down writes, then we better compact that first + if (numfiles >= vset_->options_->level0_stop_writes_trigger) { + score = 1000000; + // Log(options_->info_log, "XXX score l0 = 1000000000 max"); + } else if (numfiles >= vset_->options_->level0_slowdown_writes_trigger) { + score = 10000; + // Log(options_->info_log, "XXX score l0 = 1000000 medium"); + } else { + score = static_cast(numfiles) / + vset_->options_->level0_file_num_compaction_trigger; + if (score >= 1) { + // Log(options_->info_log, "XXX score l0 = %d least", (int)score); + } + } + } else { + // Compute the ratio of current size to size limit. + const uint64_t level_bytes = + TotalFileSize(files_[level]) - size_being_compacted[level]; + score = static_cast(level_bytes) / vset_->MaxBytesForLevel(level); + if (score > 1) { + // Log(options_->info_log, "XXX score l%d = %d ", level, (int)score); + } + if (max_score < score) { + max_score = score; + max_score_level = level; + } + } + compaction_level_[level] = level; + compaction_score_[level] = score; + } + + // update the max compaction score in levels 1 to n-1 + max_compaction_score_ = max_score; + max_compaction_score_level_ = max_score_level; + + // sort all the levels based on their score. Higher scores get listed + // first. Use bubble sort because the number of entries are small. 
+ for (int i = 0; i < NumberLevels() - 2; i++) { + for (int j = i + 1; j < NumberLevels() - 1; j++) { + if (compaction_score_[i] < compaction_score_[j]) { + double score = compaction_score_[i]; + int level = compaction_level_[i]; + compaction_score_[i] = compaction_score_[j]; + compaction_level_[i] = compaction_level_[j]; + compaction_score_[j] = score; + compaction_level_[j] = level; + } + } + } +} + +namespace { + +// Compator that is used to sort files based on their size +// In normal mode: descending size +bool CompareSizeDescending(const Version::Fsize& first, + const Version::Fsize& second) { + return (first.file->file_size > second.file->file_size); +} +// A static compator used to sort files based on their seqno +// In universal style : descending seqno +bool CompareSeqnoDescending(const Version::Fsize& first, + const Version::Fsize& second) { + if (first.file->smallest_seqno > second.file->smallest_seqno) { + assert(first.file->largest_seqno > second.file->largest_seqno); + return true; + } + assert(first.file->largest_seqno <= second.file->largest_seqno); + return false; +} + +} // anonymous namespace + +void Version::UpdateFilesBySize() { + // No need to sort the highest level because it is never compacted. + int max_level = + (vset_->options_->compaction_style == kCompactionStyleUniversal) + ? 
NumberLevels() + : NumberLevels() - 1; + + for (int level = 0; level < max_level; level++) { + const std::vector& files = files_[level]; + std::vector& files_by_size = files_by_size_[level]; + assert(files_by_size.size() == 0); + + // populate a temp vector for sorting based on size + std::vector temp(files.size()); + for (unsigned int i = 0; i < files.size(); i++) { + temp[i].index = i; + temp[i].file = files[i]; + } + + // sort the top number_of_files_to_sort_ based on file size + if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + int num = temp.size(); + std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), + CompareSeqnoDescending); + } else { + int num = Version::number_of_files_to_sort_; + if (num > (int)temp.size()) { + num = temp.size(); + } + std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), + CompareSizeDescending); + } + assert(temp.size() == files.size()); + + // initialize files_by_size_ + for (unsigned int i = 0; i < temp.size(); i++) { + files_by_size.push_back(temp[i].index); + } + next_file_to_compact_by_size_[level] = 0; + assert(files_[level].size() == files_by_size_[level].size()); + } +} + void Version::Ref() { ++refs_; } @@ -626,13 +783,13 @@ int Version::PickLevelForMemTableOutput( if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) { break; } - if (level + 2 >= vset_->NumberLevels()) { + if (level + 2 >= num_levels_) { level++; break; } GetOverlappingInputs(level + 2, &start, &limit, &overlaps); const uint64_t sum = TotalFileSize(overlaps); - if (sum > vset_->MaxGrandParentOverlapBytes(level)) { + if (sum > vset_->compaction_picker_->MaxGrandParentOverlapBytes(level)) { break; } level++; @@ -858,9 +1015,70 @@ bool Version::HasOverlappingUserKey( return false; } +int64_t Version::NumLevelBytes(int level) const { + assert(level >= 0); + assert(level < NumberLevels()); + return TotalFileSize(files_[level]); +} + +const char* Version::LevelSummary(LevelSummaryStorage* scratch) const 
{ + int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files["); + for (int i = 0; i < NumberLevels(); i++) { + int sz = sizeof(scratch->buffer) - len; + int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size())); + if (ret < 0 || ret >= sz) break; + len += ret; + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); + return scratch->buffer; +} + +const char* Version::LevelFileSummary(FileSummaryStorage* scratch, + int level) const { + int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); + for (const auto& f : files_[level]) { + int sz = sizeof(scratch->buffer) - len; + int ret = snprintf(scratch->buffer + len, sz, + "#%lu(seq=%lu,sz=%lu,%lu) ", + (unsigned long)f->number, + (unsigned long)f->smallest_seqno, + (unsigned long)f->file_size, + (unsigned long)f->being_compacted); + if (ret < 0 || ret >= sz) + break; + len += ret; + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); + return scratch->buffer; +} + +int64_t Version::MaxNextLevelOverlappingBytes() { + uint64_t result = 0; + std::vector overlaps; + for (int level = 1; level < NumberLevels() - 1; level++) { + for (const auto& f : files_[level]) { + GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps); + const uint64_t sum = TotalFileSize(overlaps); + if (sum > result) { + result = sum; + } + } + } + return result; +} + +void Version::AddLiveFiles(std::set* live) { + for (int level = 0; level < NumberLevels(); level++) { + const std::vector& files = files_[level]; + for (const auto& file : files) { + live->insert(file->number); + } + } +} + std::string Version::DebugString(bool hex) const { std::string r; - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < num_levels_; level++) { // E.g., // --- level 1 --- // 17:123['a' .. 
'd'] @@ -929,20 +1147,18 @@ class VersionSet::Builder { public: // Initialize a builder with the files from *base and other info from *vset - Builder(VersionSet* vset, Version* base) - : vset_(vset), - base_(base) { + Builder(VersionSet* vset, Version* base) : vset_(vset), base_(base) { base_->Ref(); - levels_ = new LevelState[vset_->NumberLevels()]; + levels_ = new LevelState[base->NumberLevels()]; BySmallestKey cmp; cmp.internal_comparator = &vset_->icmp_; - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < base->NumberLevels(); level++) { levels_[level].added_files = new FileSet(cmp); } } ~Builder() { - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < base_->NumberLevels(); level++) { const FileSet* added = levels_[level].added_files; std::vector to_unref; to_unref.reserve(added->size()); @@ -965,7 +1181,7 @@ class VersionSet::Builder { void CheckConsistency(Version* v) { #ifndef NDEBUG - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < v->NumberLevels(); level++) { // Make sure there is no overlap in levels > 0 if (level > 0) { for (uint32_t i = 1; i < v->files_[level].size(); i++) { @@ -983,14 +1199,12 @@ class VersionSet::Builder { #endif } - void CheckConsistencyForDeletes( - VersionEdit* edit, - unsigned int number, - int level) { + void CheckConsistencyForDeletes(VersionEdit* edit, unsigned int number, + int level) { #ifndef NDEBUG // a file to be deleted better exist in the previous version bool found = false; - for (int l = 0; !found && l < vset_->NumberLevels(); l++) { + for (int l = 0; !found && l < base_->NumberLevels(); l++) { const std::vector& base_files = base_->files_[l]; for (unsigned int i = 0; i < base_files.size(); i++) { FileMetaData* f = base_files[i]; @@ -1003,7 +1217,7 @@ class VersionSet::Builder { // if the file did not exist in the previous version, then it // is possibly moved from lower level to higher level 
in current // version - for (int l = level+1; !found && l < vset_->NumberLevels(); l++) { + for (int l = level+1; !found && l < base_->NumberLevels(); l++) { const FileSet* added = levels_[l].added_files; for (FileSet::const_iterator added_iter = added->begin(); added_iter != added->end(); ++added_iter) { @@ -1035,13 +1249,6 @@ class VersionSet::Builder { void Apply(VersionEdit* edit) { CheckConsistency(base_); - // Update compaction pointers - for (size_t i = 0; i < edit->compact_pointers_.size(); i++) { - const int level = edit->compact_pointers_[i].first; - vset_->compact_pointer_[level] = - edit->compact_pointers_[i].second.Encode().ToString(); - } - // Delete files const VersionEdit::DeletedFileSet& del = edit->deleted_files_; for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin(); @@ -1086,7 +1293,7 @@ class VersionSet::Builder { CheckConsistency(v); BySmallestKey cmp; cmp.internal_comparator = &vset_->icmp_; - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < base_->NumberLevels(); level++) { // Merge the set of added files with the set of pre-existing files. // Drop any deleted files. Store the result in *v. 
const std::vector& base_files = base_->files_[level]; @@ -1133,8 +1340,7 @@ class VersionSet::Builder { } }; -VersionSet::VersionSet(const std::string& dbname, - const Options* options, +VersionSet::VersionSet(const std::string& dbname, const Options* options, const EnvOptions& storage_options, TableCache* table_cache, const InternalKeyComparator* cmp) @@ -1149,13 +1355,16 @@ VersionSet::VersionSet(const std::string& dbname, log_number_(0), prev_log_number_(0), num_levels_(options_->num_levels), - compactions_in_progress_(options_->num_levels), + need_slowdown_for_num_level0_files_(false), current_version_number_(0), manifest_file_size_(0), storage_options_(storage_options), - storage_options_compactions_(storage_options_) { - compact_pointer_ = new std::string[options_->num_levels]; - Init(options_->num_levels); + storage_options_compactions_(storage_options_) { + if (options_->compaction_style == kCompactionStyleUniversal) { + compaction_picker_.reset(new UniversalCompactionPicker(options_, &icmp_)); + } else { + compaction_picker_.reset(new LevelCompactionPicker(options_, &icmp_)); + } } VersionSet::~VersionSet() { @@ -1169,29 +1378,6 @@ VersionSet::~VersionSet() { delete file; } obsolete_files_.clear(); - delete[] compact_pointer_; - delete[] max_file_size_; - delete[] level_max_bytes_; -} - -void VersionSet::Init(int num_levels) { - max_file_size_ = new uint64_t[num_levels]; - level_max_bytes_ = new uint64_t[num_levels]; - int target_file_size_multiplier = options_->target_file_size_multiplier; - int max_bytes_multiplier = options_->max_bytes_for_level_multiplier; - for (int i = 0; i < num_levels; i++) { - if (i == 0 && options_->compaction_style == kCompactionStyleUniversal) { - max_file_size_[i] = ULLONG_MAX; - level_max_bytes_[i] = options_->max_bytes_for_level_base; - } else if (i > 1) { - max_file_size_[i] = max_file_size_[i-1] * target_file_size_multiplier; - level_max_bytes_[i] = level_max_bytes_[i-1] * max_bytes_multiplier * - 
options_->max_bytes_for_level_multiplier_additional[i-1]; - } else { - max_file_size_[i] = options_->target_file_size_base; - level_max_bytes_[i] = options_->max_bytes_for_level_base; - } - } } void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, @@ -1204,6 +1390,9 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, column_family_data->current->Unref(); } column_family_data->current = v; + need_slowdown_for_num_level0_files_ = + (options_->level0_slowdown_writes_trigger >= 0 && + v->NumLevelFiles(0) >= options_->level0_slowdown_writes_trigger); v->Ref(); // Append to linked list @@ -1269,8 +1458,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, // because &w is ensuring that all new writes get queued. { // calculate the amount of data being compacted at every level - std::vector size_being_compacted(NumberLevels()-1); - SizeBeingCompacted(size_being_compacted); + std::vector size_being_compacted(v->NumberLevels() - 1); + compaction_picker_->SizeBeingCompacted(size_being_compacted); mu->Unlock(); @@ -1288,8 +1477,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, // The calls to Finalize and UpdateFilesBySize are cpu-heavy // and is best called outside the mutex. 
- Finalize(v, size_being_compacted); - UpdateFilesBySize(v); + v->Finalize(size_being_compacted); + v->UpdateFilesBySize(); // Write new record to MANIFEST log if (s.ok()) { @@ -1382,10 +1571,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, return s; } -void VersionSet::LogAndApplyHelper(Builder* builder, - Version* v, - VersionEdit* edit, - port::Mutex* mu) { +void VersionSet::LogAndApplyHelper(Builder* builder, Version* v, + VersionEdit* edit, port::Mutex* mu) { mu->AssertHeld(); if (edit->has_log_number_) { @@ -1450,7 +1637,7 @@ Status VersionSet::Recover() { std::unordered_map builders; // add default column family - VersionEdit default_cf_edit(0); + VersionEdit default_cf_edit; default_cf_edit.AddColumnFamily(default_column_family_name); default_cf_edit.SetColumnFamily(0); ColumnFamilyData* default_cfd = @@ -1465,20 +1652,18 @@ Status VersionSet::Recover() { Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit(NumberLevels()); + VersionEdit edit; s = edit.DecodeFrom(record); - if (s.ok()) { - if (edit.has_comparator_ && - edit.comparator_ != icmp_.user_comparator()->Name()) { - s = Status::InvalidArgument(icmp_.user_comparator()->Name(), - "does not match existing comparator " + - edit.comparator_); - } - } - if (!s.ok()) { break; } + if (edit.has_comparator_ && + edit.comparator_ != icmp_.user_comparator()->Name()) { + s = Status::InvalidArgument( + icmp_.user_comparator()->Name(), + "does not match existing comparator " + edit.comparator_); + break; + } if (edit.is_column_family_add_) { ColumnFamilyData* new_cfd = @@ -1492,6 +1677,14 @@ Status VersionSet::Recover() { builders.erase(builder); DropColumnFamily(&edit); } else { + auto cfd = column_family_data_.find(edit.column_family_); + assert(cfd != column_family_data_.end()); + if (edit.max_level_ >= cfd->second->current->NumberLevels()) { + s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + 
break; + } + // if it isn't column family add or column family drop, // then it's a file add/delete, which should be forwarded // to builder @@ -1546,9 +1739,9 @@ Status VersionSet::Recover() { builders[cfd.first]->SaveTo(v); // Install recovered version - std::vector size_being_compacted(NumberLevels()-1); - SizeBeingCompacted(size_being_compacted); - Finalize(v, size_being_compacted); + std::vector size_being_compacted(v->NumberLevels() - 1); + compaction_picker_->SizeBeingCompacted(size_being_compacted); + v->Finalize(size_being_compacted); AppendVersion(cfd.second, v); } @@ -1578,10 +1771,8 @@ Status VersionSet::Recover() { return s; } -Status VersionSet::DumpManifest(Options& options, - std::string& dscname, - bool verbose, - bool hex) { +Status VersionSet::DumpManifest(Options& options, std::string& dscname, + bool verbose, bool hex) { struct LogReporter : public log::Reader::Reporter { Status* status; virtual void Corruption(size_t bytes, const Status& s) { @@ -1617,7 +1808,7 @@ Status VersionSet::DumpManifest(Options& options, Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit(NumberLevels()); + VersionEdit edit; s = edit.DecodeFrom(record); if (s.ok()) { if (edit.has_comparator_ && @@ -1711,285 +1902,64 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) { } } -void VersionSet::Finalize(Version* v, - std::vector& size_being_compacted) { - // Pre-sort level0 for Get() - if (options_->compaction_style == kCompactionStyleUniversal) { - std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirstBySeqNo); - } else { - std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirst); - } - - double max_score = 0; - int max_score_level = 0; - - int num_levels_to_check = - (options_->compaction_style != kCompactionStyleUniversal) ? 
- NumberLevels() - 1 : 1; - - for (int level = 0; level < num_levels_to_check; level++) { +Status VersionSet::WriteSnapshot(log::Writer* log) { + // TODO: Break up into multiple records to reduce memory usage on recovery? - double score; - if (level == 0) { - // We treat level-0 specially by bounding the number of files - // instead of number of bytes for two reasons: - // - // (1) With larger write-buffer sizes, it is nice not to do too - // many level-0 compactions. - // - // (2) The files in level-0 are merged on every read and - // therefore we wish to avoid too many files when the individual - // file size is small (perhaps because of a small write-buffer - // setting, or very high compression ratios, or lots of - // overwrites/deletions). - int numfiles = 0; - for (unsigned int i = 0; i < v->files_[level].size(); i++) { - if (!v->files_[level][i]->being_compacted) { - numfiles++; + for (auto cfd : column_family_data_) { + { + // Store column family info + VersionEdit edit; + if (cfd.first != 0) { + // default column family is always there, + // no need to explicitly write it + edit.AddColumnFamily(cfd.second->name); + edit.SetColumnFamily(cfd.first); + std::string record; + edit.EncodeTo(&record); + Status s = log->AddRecord(record); + if (!s.ok()) { + return s; } } + } - // If we are slowing down writes, then we better compact that first - if (numfiles >= options_->level0_stop_writes_trigger) { - score = 1000000; - // Log(options_->info_log, "XXX score l0 = 1000000000 max"); - } else if (numfiles >= options_->level0_slowdown_writes_trigger) { - score = 10000; - // Log(options_->info_log, "XXX score l0 = 1000000 medium"); - } else { - score = numfiles / - static_cast(options_->level0_file_num_compaction_trigger); - if (score >= 1) { - // Log(options_->info_log, "XXX score l0 = %d least", (int)score); + { + // Save files + VersionEdit edit; + edit.SetColumnFamily(cfd.first); + + for (int level = 0; level < NumberLevels(); level++) { + const std::vector& files 
= + cfd.second->current->files_[level]; + for (size_t i = 0; i < files.size(); i++) { + const FileMetaData* f = files[i]; + edit.AddFile(level, + f->number, + f->file_size, + f->smallest, + f->largest, + f->smallest_seqno, + f->largest_seqno); } } - } else { - // Compute the ratio of current size to size limit. - const uint64_t level_bytes = TotalFileSize(v->files_[level]) - - size_being_compacted[level]; - score = static_cast(level_bytes) / MaxBytesForLevel(level); - if (score > 1) { - // Log(options_->info_log, "XXX score l%d = %d ", level, (int)score); - } - if (max_score < score) { - max_score = score; - max_score_level = level; - } - } - v->compaction_level_[level] = level; - v->compaction_score_[level] = score; - } - - // update the max compaction score in levels 1 to n-1 - v->max_compaction_score_ = max_score; - v->max_compaction_score_level_ = max_score_level; - - // sort all the levels based on their score. Higher scores get listed - // first. Use bubble sort because the number of entries are small. 
- for (int i = 0; i < NumberLevels()-2; i++) { - for (int j = i+1; j < NumberLevels()-1; j++) { - if (v->compaction_score_[i] < v->compaction_score_[j]) { - double score = v->compaction_score_[i]; - int level = v->compaction_level_[i]; - v->compaction_score_[i] = v->compaction_score_[j]; - v->compaction_level_[i] = v->compaction_level_[j]; - v->compaction_score_[j] = score; - v->compaction_level_[j] = level; - } - } - } -} - -// A static compator used to sort files based on their size -// In normal mode: descending size -static bool compareSizeDescending(const VersionSet::Fsize& first, - const VersionSet::Fsize& second) { - return (first.file->file_size > second.file->file_size); -} -// A static compator used to sort files based on their seqno -// In universal style : descending seqno -static bool compareSeqnoDescending(const VersionSet::Fsize& first, - const VersionSet::Fsize& second) { - if (first.file->smallest_seqno > second.file->smallest_seqno) { - assert(first.file->largest_seqno > second.file->largest_seqno); - return true; - } - assert(first.file->largest_seqno <= second.file->largest_seqno); - return false; -} - -// sort all files in level1 to level(n-1) based on file size -void VersionSet::UpdateFilesBySize(Version* v) { - - // No need to sort the highest level because it is never compacted. - int max_level = (options_->compaction_style == kCompactionStyleUniversal) ? 
- NumberLevels() : NumberLevels() - 1; - - for (int level = 0; level < max_level; level++) { - - const std::vector& files = v->files_[level]; - std::vector& files_by_size = v->files_by_size_[level]; - assert(files_by_size.size() == 0); - - // populate a temp vector for sorting based on size - std::vector temp(files.size()); - for (unsigned int i = 0; i < files.size(); i++) { - temp[i].index = i; - temp[i].file = files[i]; - } - - // sort the top number_of_files_to_sort_ based on file size - if (options_->compaction_style == kCompactionStyleUniversal) { - int num = temp.size(); - std::partial_sort(temp.begin(), temp.begin() + num, - temp.end(), compareSeqnoDescending); - } else { - int num = Version::number_of_files_to_sort_; - if (num > (int)temp.size()) { - num = temp.size(); - } - std::partial_sort(temp.begin(), temp.begin() + num, - temp.end(), compareSizeDescending); - } - assert(temp.size() == files.size()); - - // initialize files_by_size_ - for (unsigned int i = 0; i < temp.size(); i++) { - files_by_size.push_back(temp[i].index); - } - v->next_file_to_compact_by_size_[level] = 0; - assert(v->files_[level].size() == v->files_by_size_[level].size()); - } -} - -Status VersionSet::WriteSnapshot(log::Writer* log) { - // TODO: Break up into multiple records to reduce memory usage on recovery? 
- - for (auto cfd : column_family_data_) { - { - // Store column family info - VersionEdit edit(0); - if (cfd.first != 0) { - // default column family is always there, - // no need to explicitly write it - edit.AddColumnFamily(cfd.second->name); - edit.SetColumnFamily(cfd.first); - std::string record; - edit.EncodeTo(&record); - Status s = log->AddRecord(record); - if (!s.ok()) { - return s; - } - } - } - - { - // Save files - VersionEdit edit(NumberLevels()); - edit.SetColumnFamily(cfd.first); - - for (int level = 0; level < NumberLevels(); level++) { - const std::vector& files = - cfd.second->current->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - const FileMetaData* f = files[i]; - edit.AddFile(level, - f->number, - f->file_size, - f->smallest, - f->largest, - f->smallest_seqno, - f->largest_seqno); - } - } - std::string record; - edit.EncodeTo(&record); - Status s = log->AddRecord(record); - if (!s.ok()) { - return s; + std::string record; + edit.EncodeTo(&record); + Status s = log->AddRecord(record); + if (!s.ok()) { + return s; } } } // Save metadata - VersionEdit edit(NumberLevels()); + VersionEdit edit; edit.SetComparatorName(icmp_.user_comparator()->Name()); - // Save compaction pointers - for (int level = 0; level < NumberLevels(); level++) { - if (!compact_pointer_[level].empty()) { - InternalKey key; - key.DecodeFrom(compact_pointer_[level]); - edit.SetCompactPointer(level, key); - } - } - std::string record; edit.EncodeTo(&record); return log->AddRecord(record); } -int VersionSet::NumLevelFiles(int level) const { - assert(level >= 0); - assert(level < NumberLevels()); - // TODO this only works for default column family now - assert(column_family_data_.find(0) != column_family_data_.end()); - Version* version = column_family_data_.find(0)->second->current; - return version->files_[level].size(); -} - -const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const { - // TODO this only works for default column family now - 
Version* version = column_family_data_.find(0)->second->current; - int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files["); - for (int i = 0; i < NumberLevels(); i++) { - int sz = sizeof(scratch->buffer) - len; - int ret = snprintf(scratch->buffer + len, sz, "%d ", - int(version->files_[i].size())); - if (ret < 0 || ret >= sz) - break; - len += ret; - } - snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); - return scratch->buffer; -} - -const char* VersionSet::LevelDataSizeSummary( - LevelSummaryStorage* scratch) const { - int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); - for (int i = 0; i < NumberLevels(); i++) { - int sz = sizeof(scratch->buffer) - len; - int ret = snprintf(scratch->buffer + len, sz, "%lu ", - (unsigned long)NumLevelBytes(i)); - if (ret < 0 || ret >= sz) - break; - len += ret; - } - snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); - return scratch->buffer; -} - -const char* VersionSet::LevelFileSummary(Version* v, - FileSummaryStorage* scratch, - int level) const { - int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); - for (unsigned int i = 0; i < v->files_[level].size(); i++) { - FileMetaData* f = v->files_[level][i]; - int sz = sizeof(scratch->buffer) - len; - int ret = snprintf(scratch->buffer + len, sz, - "#%lu(seq=%lu,sz=%lu,%lu) ", - (unsigned long)f->number, - (unsigned long)f->smallest_seqno, - (unsigned long)f->file_size, - (unsigned long)f->being_compacted); - if (ret < 0 || ret >= sz) - break; - len += ret; - } - snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); - return scratch->buffer; -} - // Opens the mainfest file and reads all records // till it finds the record we are looking for. 
bool VersionSet::ManifestContains(const std::string& record) const { @@ -2021,7 +1991,7 @@ bool VersionSet::ManifestContains(const std::string& record) const { uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { uint64_t result = 0; - for (int level = 0; level < NumberLevels(); level++) { + for (int level = 0; level < v->NumberLevels(); level++) { const std::vector& files = v->files_[level]; for (size_t i = 0; i < files.size(); i++) { if (icmp_.Compare(files[i]->largest, ikey) <= 0) { @@ -2057,9 +2027,8 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { int64_t total_files = 0; for (auto cfd : column_family_data_) { for (Version* v = cfd.second->dummy_versions.next_; - v != &cfd.second->dummy_versions; - v = v->next_) { - for (int level = 0; level < NumberLevels(); level++) { + v != &cfd.second->dummy_versions; v = v->next_) { + for (int level = 0; level < v->NumberLevels(); level++) { total_files += v->files_[level].size(); } } @@ -2070,9 +2039,8 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { for (auto cfd : column_family_data_) { for (Version* v = cfd.second->dummy_versions.next_; - v != &cfd.second->dummy_versions; - v = v->next_) { - for (int level = 0; level < NumberLevels(); level++) { + v != &cfd.second->dummy_versions; v = v->next_) { + for (int level = 0; level < v->NumberLevels(); level++) { for (const auto& f : v->files_[level]) { live_list->push_back(f->number); } @@ -2081,80 +2049,20 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { } } -void VersionSet::AddLiveFilesCurrentVersion(std::set* live) { - // TODO this only works for default column family now - Version* v = column_family_data_.find(0)->second->current; - for (int level = 0; level < NumberLevels(); level++) { - const std::vector& files = v->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - live->insert(files[i]->number); - } - } -} - -int64_t VersionSet::NumLevelBytes(int level) const { +Compaction* 
VersionSet::PickCompaction() { // TODO this only works for default column family now Version* version = column_family_data_.find(0)->second->current; - assert(level >= 0); - assert(level < NumberLevels()); - assert(version); - return TotalFileSize(version->files_[level]); + return compaction_picker_->PickCompaction(version); } -int64_t VersionSet::MaxNextLevelOverlappingBytes() { +Compaction* VersionSet::CompactRange(int input_level, int output_level, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end) { // TODO this only works for default column family now Version* version = column_family_data_.find(0)->second->current; - uint64_t result = 0; - std::vector overlaps; - for (int level = 1; level < NumberLevels() - 1; level++) { - for (size_t i = 0; i < version->files_[level].size(); i++) { - const FileMetaData* f = version->files_[level][i]; - version->GetOverlappingInputs( - level + 1, &f->smallest, &f->largest, &overlaps); - const uint64_t sum = TotalFileSize(overlaps); - if (sum > result) { - result = sum; - } - } - } - return result; -} - -// Stores the minimal range that covers all entries in inputs in -// *smallest, *largest. -// REQUIRES: inputs is not empty -void VersionSet::GetRange(const std::vector& inputs, - InternalKey* smallest, - InternalKey* largest) { - assert(!inputs.empty()); - smallest->Clear(); - largest->Clear(); - for (size_t i = 0; i < inputs.size(); i++) { - FileMetaData* f = inputs[i]; - if (i == 0) { - *smallest = f->smallest; - *largest = f->largest; - } else { - if (icmp_.Compare(f->smallest, *smallest) < 0) { - *smallest = f->smallest; - } - if (icmp_.Compare(f->largest, *largest) > 0) { - *largest = f->largest; - } - } - } -} - -// Stores the minimal range that covers all entries in inputs1 and inputs2 -// in *smallest, *largest. 
-// REQUIRES: inputs is not empty -void VersionSet::GetRange2(const std::vector& inputs1, - const std::vector& inputs2, - InternalKey* smallest, - InternalKey* largest) { - std::vector all = inputs1; - all.insert(all.end(), inputs2.begin(), inputs2.end()); - GetRange(all, smallest, largest); + return compaction_picker_->CompactRange(version, input_level, output_level, + begin, end, compaction_end); } Iterator* VersionSet::MakeInputIterator(Compaction* c) { @@ -2194,29 +2102,11 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { } double VersionSet::MaxBytesForLevel(int level) { - // Note: the result for level zero is not really used since we set - // the level-0 compaction threshold based on number of files. - assert(level >= 0); - assert(level < NumberLevels()); - return level_max_bytes_[level]; + return compaction_picker_->MaxBytesForLevel(level); } uint64_t VersionSet::MaxFileSizeForLevel(int level) { - assert(level >= 0); - assert(level < NumberLevels()); - return max_file_size_[level]; -} - -uint64_t VersionSet::ExpandedCompactionByteSizeLimit(int level) { - uint64_t result = MaxFileSizeForLevel(level); - result *= options_->expanded_compaction_factor; - return result; -} - -uint64_t VersionSet::MaxGrandParentOverlapBytes(int level) { - uint64_t result = MaxFileSizeForLevel(level); - result *= options_->max_grandparent_overlap_factor; - return result; + return compaction_picker_->MaxFileSizeForLevel(level); } // verify that the files listed in this compaction are present @@ -2269,737 +2159,16 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { return true; // everything good } -// Clear all files to indicate that they are not being compacted -// Delete this compaction from the list of running compactions. 
void VersionSet::ReleaseCompactionFiles(Compaction* c, Status status) { - c->MarkFilesBeingCompacted(false); - compactions_in_progress_[c->level()].erase(c); - if (!status.ok()) { - c->ResetNextCompactionIndex(); - } -} - -// The total size of files that are currently being compacted -// at at every level upto the penultimate level. -void VersionSet::SizeBeingCompacted(std::vector& sizes) { - for (int level = 0; level < NumberLevels()-1; level++) { - uint64_t total = 0; - for (std::set::iterator it = - compactions_in_progress_[level].begin(); - it != compactions_in_progress_[level].end(); - ++it) { - Compaction* c = (*it); - assert(c->level() == level); - for (int i = 0; i < c->num_input_files(0); i++) { - total += c->input(0,i)->file_size; - } - } - sizes[level] = total; - } -} - -// -// Look at overall size amplification. If size amplification -// exceeeds the configured value, then do a compaction -// of the candidate files all the way upto the earliest -// base file (overrides configured values of file-size ratios, -// min_merge_width and max_merge_width). -// -Compaction* VersionSet::PickCompactionUniversalSizeAmp(int level, - double score) { - assert (level == 0); - // TODO this only works for default column family now - Version* version = column_family_data_.find(0)->second->current; - - // percentage flexibilty while reducing size amplification - uint64_t ratio = options_->compaction_options_universal. - max_size_amplification_percent; - - // The files are sorted from newest first to oldest last. 
- std::vector& file_by_time = version->files_by_size_[level]; - assert(file_by_time.size() == version->files_[level].size()); - - unsigned int candidate_count = 0; - uint64_t candidate_size = 0; - unsigned int start_index = 0; - FileMetaData* f = nullptr; - - // Skip files that are already being compacted - for (unsigned int loop = 0; loop < file_by_time.size() - 1; loop++) { - int index = file_by_time[loop]; - f = version->files_[level][index]; - if (!f->being_compacted) { - start_index = loop; // Consider this as the first candidate. - break; - } - Log(options_->info_log, "Universal: skipping file %lu[%d] compacted %s", - (unsigned long)f->number, - loop, - " cannot be a candidate to reduce size amp.\n"); - f = nullptr; - } - if (f == nullptr) { - return nullptr; // no candidate files - } - - Log(options_->info_log, "Universal: First candidate file %lu[%d] %s", - (unsigned long)f->number, - start_index, - " to reduce size amp.\n"); - - // keep adding up all the remaining files - for (unsigned int loop = start_index; loop < file_by_time.size() - 1; - loop++) { - int index = file_by_time[loop]; - f = version->files_[level][index]; - if (f->being_compacted) { - Log(options_->info_log, - "Universal: Possible candidate file %lu[%d] %s.", - (unsigned long)f->number, - loop, - " is already being compacted. No size amp reduction possible.\n"); - return nullptr; - } - candidate_size += f->file_size; - candidate_count++; - } - if (candidate_count == 0) { - return nullptr; - } - - // size of earliest file - int index = file_by_time[file_by_time.size() - 1]; - uint64_t earliest_file_size = version->files_[level][index]->file_size; - - // size amplification = percentage of additional size - if (candidate_size * 100 < ratio * earliest_file_size) { - Log(options_->info_log, - "Universal: size amp not needed. 
newer-files-total-size %lu " - "earliest-file-size %lu", - (unsigned long)candidate_size, - (unsigned long)earliest_file_size); - return nullptr; - } else { - Log(options_->info_log, - "Universal: size amp needed. newer-files-total-size %lu " - "earliest-file-size %lu", - (unsigned long)candidate_size, - (unsigned long)earliest_file_size); - } - assert(start_index >= 0 && start_index < file_by_time.size() - 1); - - // create a compaction request - // We always compact all the files, so always compress. - Compaction* c = new Compaction(level, - level, - MaxFileSizeForLevel(level), - LLONG_MAX, - NumberLevels(), - version, - false, - true); - c->score_ = score; - for (unsigned int loop = start_index; loop < file_by_time.size(); loop++) { - int index = file_by_time[loop]; - f = version->files_[level][index]; - c->inputs_[0].push_back(f); - Log(options_->info_log, - "Universal: size amp picking file %lu[%d] with size %lu", - (unsigned long)f->number, - index, - (unsigned long)f->file_size); - } - return c; -} - -// -// Consider compaction files based on their size differences with -// the next file in time order. -// -Compaction* VersionSet::PickCompactionUniversalReadAmp( - int level, double score, unsigned int ratio, - unsigned int max_number_of_files_to_compact) { - - // TODO this only works for default column family now - Version* version = column_family_data_.find(0)->second->current; - - unsigned int min_merge_width = - options_->compaction_options_universal.min_merge_width; - unsigned int max_merge_width = - options_->compaction_options_universal.max_merge_width; - - // The files are sorted from newest first to oldest last. 
- std::vector& file_by_time = version->files_by_size_[level]; - FileMetaData* f = nullptr; - bool done = false; - int start_index = 0; - unsigned int candidate_count; - assert(file_by_time.size() == version->files_[level].size()); - - unsigned int max_files_to_compact = std::min(max_merge_width, - max_number_of_files_to_compact); - min_merge_width = std::max(min_merge_width, 2U); - - // Considers a candidate file only if it is smaller than the - // total size accumulated so far. - for (unsigned int loop = 0; loop < file_by_time.size(); loop++) { - - candidate_count = 0; - - // Skip files that are already being compacted - for (f = nullptr; loop < file_by_time.size(); loop++) { - int index = file_by_time[loop]; - f = version->files_[level][index]; - - if (!f->being_compacted) { - candidate_count = 1; - break; - } - Log(options_->info_log, - "Universal: file %lu[%d] being compacted, skipping", - (unsigned long)f->number, loop); - f = nullptr; - } - - // This file is not being compacted. Consider it as the - // first candidate to be compacted. - uint64_t candidate_size = f != nullptr? f->file_size : 0; - if (f != nullptr) { - Log(options_->info_log, "Universal: Possible candidate file %lu[%d].", - (unsigned long)f->number, loop); - } - - // Check if the suceeding files need compaction. - for (unsigned int i = loop+1; - candidate_count < max_files_to_compact && i < file_by_time.size(); - i++) { - int index = file_by_time[i]; - FileMetaData* f = version->files_[level][index]; - if (f->being_compacted) { - break; - } - // pick files if the total candidate file size (increased by the - // specified ratio) is still larger than the next candidate file. - uint64_t sz = (candidate_size * (100L + ratio)) /100; - if (sz < f->file_size) { - break; - } - candidate_count++; - candidate_size += f->file_size; - } - - // Found a series of consecutive files that need compaction. 
- if (candidate_count >= (unsigned int)min_merge_width) { - start_index = loop; - done = true; - break; - } else { - for (unsigned int i = loop; - i < loop + candidate_count && i < file_by_time.size(); i++) { - int index = file_by_time[i]; - FileMetaData* f = version->files_[level][index]; - Log(options_->info_log, - "Universal: Skipping file %lu[%d] with size %lu %d\n", - (unsigned long)f->number, - i, - (unsigned long)f->file_size, - f->being_compacted); - } - } - } - if (!done || candidate_count <= 1) { - return nullptr; - } - unsigned int first_index_after = start_index + candidate_count; - // Compression is enabled if files compacted earlier already reached - // size ratio of compression. - bool enable_compression = true; - int ratio_to_compress = - options_->compaction_options_universal.compression_size_percent; - if (ratio_to_compress >= 0) { - uint64_t total_size = TotalFileSize(version->files_[level]); - uint64_t older_file_size = 0; - for (unsigned int i = file_by_time.size() - 1; i >= first_index_after; - i--) { - older_file_size += version->files_[level][file_by_time[i]]->file_size; - if (older_file_size * 100L >= total_size * (long) ratio_to_compress) { - enable_compression = false; - break; - } - } - } - Compaction* c = new Compaction(level, - level, - MaxFileSizeForLevel(level), - LLONG_MAX, - NumberLevels(), - version, - false, - enable_compression); - c->score_ = score; - - for (unsigned int i = start_index; i < first_index_after; i++) { - int index = file_by_time[i]; - FileMetaData* f = version->files_[level][index]; - c->inputs_[0].push_back(f); - Log(options_->info_log, "Universal: Picking file %lu[%d] with size %lu\n", - (unsigned long)f->number, - i, - (unsigned long)f->file_size); - } - return c; -} - -// -// Universal style of compaction. Pick files that are contiguous in -// time-range to compact. 
-// -Compaction* VersionSet::PickCompactionUniversal(int level, double score) { - assert (level == 0); - // TODO this only works for default column family now - Version* version = column_family_data_.find(0)->second->current; - - if ((version->files_[level].size() < - (unsigned int)options_->level0_file_num_compaction_trigger)) { - Log(options_->info_log, "Universal: nothing to do\n"); - return nullptr; - } - VersionSet::FileSummaryStorage tmp; - Log(options_->info_log, - "Universal: candidate files(%lu): %s\n", - version->files_[level].size(), - LevelFileSummary(version, &tmp, 0)); - - // Check for size amplification first. - Compaction* c = PickCompactionUniversalSizeAmp(level, score); - if (c == nullptr) { - - // Size amplification is within limits. Try reducing read - // amplification while maintaining file size ratios. - unsigned int ratio = options_->compaction_options_universal.size_ratio; - c = PickCompactionUniversalReadAmp(level, score, ratio, UINT_MAX); - - // Size amplification and file size ratios are within configured limits. - // If max read amplification is exceeding configured limits, then force - // compaction without looking at filesize ratios and try to reduce - // the number of files to fewer than level0_file_num_compaction_trigger. 
- if (c == nullptr) { - unsigned int num_files = version->files_[level].size() - - options_->level0_file_num_compaction_trigger; - c = PickCompactionUniversalReadAmp(level, score, UINT_MAX, num_files); - } - } - if (c == nullptr) { - return nullptr; - } - assert(c->inputs_[0].size() > 1); - - // validate that all the chosen files are non overlapping in time - FileMetaData* newerfile __attribute__((unused)) = nullptr; - for (unsigned int i = 0; i < c->inputs_[0].size(); i++) { - FileMetaData* f = c->inputs_[0][i]; - assert (f->smallest_seqno <= f->largest_seqno); - assert(newerfile == nullptr || - newerfile->smallest_seqno > f->largest_seqno); - newerfile = f; - } - - // The files are sorted from newest first to oldest last. - std::vector& file_by_time = version->files_by_size_[level]; - - // Is the earliest file part of this compaction? - int last_index = file_by_time[file_by_time.size()-1]; - FileMetaData* last_file = version->files_[level][last_index]; - if (c->inputs_[0][c->inputs_[0].size()-1] == last_file) { - c->bottommost_level_ = true; - } - - // update statistics - if (options_->statistics != nullptr) { - options_->statistics->measureTime(NUM_FILES_IN_SINGLE_COMPACTION, - c->inputs_[0].size()); - } - - // mark all the files that are being compacted - c->MarkFilesBeingCompacted(true); - - // remember this currently undergoing compaction - compactions_in_progress_[level].insert(c); - - // Record whether this compaction includes all sst files. - // For now, it is only relevant in universal compaction mode. - c->is_full_compaction_ = (c->inputs_[0].size() == version->files_[0].size()); - - return c; -} - -Compaction* VersionSet::PickCompactionBySize(int level, double score) { - Compaction* c = nullptr; - // TODO this only works for default column family now - Version* version = column_family_data_.find(0)->second->current; - - // level 0 files are overlapping. So we cannot pick more - // than one concurrent compactions at this level. 
This - // could be made better by looking at key-ranges that are - // being compacted at level 0. - if (level == 0 && compactions_in_progress_[level].size() == 1) { - return nullptr; - } - - assert(level >= 0); - assert(level+1 < NumberLevels()); - c = new Compaction(level, - level + 1, - MaxFileSizeForLevel(level + 1), - MaxGrandParentOverlapBytes(level), - NumberLevels(), - version); - c->score_ = score; - - // Pick the largest file in this level that is not already - // being compacted - std::vector& file_size = version->files_by_size_[level]; - - // record the first file that is not yet compacted - int nextIndex = -1; - - for (unsigned int i = version->next_file_to_compact_by_size_[level]; - i < file_size.size(); i++) { - int index = file_size[i]; - FileMetaData* f = version->files_[level][index]; - - // check to verify files are arranged in descending size - assert((i == file_size.size() - 1) || - (i >= Version::number_of_files_to_sort_-1) || - (f->file_size >= version->files_[level][file_size[i+1]]->file_size)); - - // do not pick a file to compact if it is being compacted - // from n-1 level. - if (f->being_compacted) { - continue; - } - - // remember the startIndex for the next call to PickCompaction - if (nextIndex == -1) { - nextIndex = i; - } - - //if (i > Version::number_of_files_to_sort_) { - // Log(options_->info_log, "XXX Looking at index %d", i); - //} - - // Do not pick this file if its parents at level+1 are being compacted. 
- // Maybe we can avoid redoing this work in SetupOtherInputs - int parent_index = -1; - if (ParentRangeInCompaction(&f->smallest, &f->largest, level, - &parent_index)) { - continue; - } - c->inputs_[0].push_back(f); - c->base_index_ = index; - c->parent_index_ = parent_index; - break; - } - - if (c->inputs_[0].empty()) { - delete c; - c = nullptr; - } - - // store where to start the iteration in the next call to PickCompaction - version->next_file_to_compact_by_size_[level] = nextIndex; - - return c; -} - -Compaction* VersionSet::PickCompaction() { - Compaction* c = nullptr; - // TODO this only works for default column family now - Version* version = column_family_data_.find(0)->second->current; - int level = -1; - - // Compute the compactions needed. It is better to do it here - // and also in LogAndApply(), otherwise the values could be stale. - std::vector size_being_compacted(NumberLevels()-1); - version->vset_->SizeBeingCompacted(size_being_compacted); - Finalize(version, size_being_compacted); - - // In universal style of compaction, compact L0 files back into L0. - if (options_->compaction_style == kCompactionStyleUniversal) { - int level = 0; - c = PickCompactionUniversal(level, version->compaction_score_[level]); - return c; - } - - // We prefer compactions triggered by too much data in a level over - // the compactions triggered by seeks. - // - // Find the compactions by size on all levels. 
- for (int i = 0; i < NumberLevels()-1; i++) { - assert(i == 0 || version->compaction_score_[i] <= - version->compaction_score_[i-1]); - level = version->compaction_level_[i]; - if ((version->compaction_score_[i] >= 1)) { - c = PickCompactionBySize(level, version->compaction_score_[i]); - ExpandWhileOverlapping(c); - if (c != nullptr) { - break; - } - } - } - - // Find compactions needed by seeks - FileMetaData* f = version->file_to_compact_; - if (c == nullptr && f != nullptr && !f->being_compacted) { - - level = version->file_to_compact_level_; - int parent_index = -1; - - // Only allow one level 0 compaction at a time. - // Do not pick this file if its parents at level+1 are being compacted. - if (level != 0 || compactions_in_progress_[0].empty()) { - if(!ParentRangeInCompaction(&f->smallest, &f->largest, level, - &parent_index)) { - c = new Compaction(level, - level + 1, - MaxFileSizeForLevel(level + 1), - MaxGrandParentOverlapBytes(level), - NumberLevels(), - version, - true); - c->inputs_[0].push_back(f); - c->parent_index_ = parent_index; - version->file_to_compact_ = nullptr; - ExpandWhileOverlapping(c); - } - } - } - - if (c == nullptr) { - return nullptr; - } - - // Two level 0 compaction won't run at the same time, so don't need to worry - // about files on level 0 being compacted. - if (level == 0) { - assert(compactions_in_progress_[0].empty()); - InternalKey smallest, largest; - GetRange(c->inputs_[0], &smallest, &largest); - // Note that the next call will discard the file we placed in - // c->inputs_[0] earlier and replace it with an overlapping set - // which will include the picked file. - c->inputs_[0].clear(); - version->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]); - - // If we include more L0 files in the same compaction run it can - // cause the 'smallest' and 'largest' key to get extended to a - // larger range. 
So, re-invoke GetRange to get the new key range - GetRange(c->inputs_[0], &smallest, &largest); - if (ParentRangeInCompaction(&smallest, &largest, - level, &c->parent_index_)) { - delete c; - return nullptr; - } - assert(!c->inputs_[0].empty()); - } - - // Setup "level+1" files (inputs_[1]) - SetupOtherInputs(c); - - // mark all the files that are being compacted - c->MarkFilesBeingCompacted(true); - - // Is this compaction creating a file at the bottommost level - c->SetupBottomMostLevel(false); - - // remember this currently undergoing compaction - compactions_in_progress_[level].insert(c); - - return c; -} - -// Returns true if any one of the parent files are being compacted -bool VersionSet::ParentRangeInCompaction(const InternalKey* smallest, - const InternalKey* largest, int level, int* parent_index) { - std::vector inputs; - // TODO this only works for default column family now - Version* version = column_family_data_.find(0)->second->current; - - version->GetOverlappingInputs( - level + 1, smallest, largest, &inputs, *parent_index, parent_index); - return FilesInCompaction(inputs); -} - -// Returns true if any one of specified files are being compacted -bool VersionSet::FilesInCompaction(std::vector& files) { - for (unsigned int i = 0; i < files.size(); i++) { - if (files[i]->being_compacted) { - return true; - } - } - return false; -} - -// Add more files to the inputs on "level" to make sure that -// no newer version of a key is compacted to "level+1" while leaving an older -// version in a "level". Otherwise, any Get() will search "level" first, -// and will likely return an old/stale value for the key, since it always -// searches in increasing order of level to find the value. This could -// also scramble the order of merge operands. This function should be -// called any time a new Compaction is created, and its inputs_[0] are -// populated. -// -// Will set c to nullptr if it is impossible to apply this compaction. 
-void VersionSet::ExpandWhileOverlapping(Compaction* c) { - // If inputs are empty then there is nothing to expand. - if (!c || c->inputs_[0].empty()) { - return; - } - - // GetOverlappingInputs will always do the right thing for level-0. - // So we don't need to do any expansion if level == 0. - if (c->level() == 0) { - return; - } - - const int level = c->level(); - InternalKey smallest, largest; - - // Keep expanding c->inputs_[0] until we are sure that there is a - // "clean cut" boundary between the files in input and the surrounding files. - // This will ensure that no parts of a key are lost during compaction. - int hint_index = -1; - size_t old_size; - do { - old_size = c->inputs_[0].size(); - GetRange(c->inputs_[0], &smallest, &largest); - c->inputs_[0].clear(); - c->input_version_->GetOverlappingInputs( - level, &smallest, &largest, &c->inputs_[0], hint_index, &hint_index); - } while(c->inputs_[0].size() > old_size); - - // Get the new range - GetRange(c->inputs_[0], &smallest, &largest); - - // If, after the expansion, there are files that are already under - // compaction, then we must drop/cancel this compaction. - int parent_index = -1; - if (FilesInCompaction(c->inputs_[0]) || - ParentRangeInCompaction(&smallest, &largest, level, &parent_index)) { - c->inputs_[0].clear(); - c->inputs_[1].clear(); - delete c; - c = nullptr; - } -} - -// Populates the set of inputs from "level+1" that overlap with "level". -// Will also attempt to expand "level" if that doesn't expand "level+1" -// or cause "level" to include a file for compaction that has an overlapping -// user-key with another file. -void VersionSet::SetupOtherInputs(Compaction* c) { - // If inputs are empty, then there is nothing to expand. - if (c->inputs_[0].empty()) { - return; - } - - const int level = c->level(); - InternalKey smallest, largest; - - // Get the range one last time. 
- GetRange(c->inputs_[0], &smallest, &largest); - - // Populate the set of next-level files (inputs_[1]) to include in compaction - c->input_version_->GetOverlappingInputs(level + 1, - &smallest, - &largest, - &c->inputs_[1], - c->parent_index_, - &c->parent_index_); - - // Get entire range covered by compaction - InternalKey all_start, all_limit; - GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); - - // See if we can further grow the number of inputs in "level" without - // changing the number of "level+1" files we pick up. We also choose NOT - // to expand if this would cause "level" to include some entries for some - // user key, while excluding other entries for the same user key. This - // can happen when one user key spans multiple files. - if (!c->inputs_[1].empty()) { - std::vector expanded0; - c->input_version_->GetOverlappingInputs( - level, &all_start, &all_limit, &expanded0, c->base_index_, nullptr); - const uint64_t inputs0_size = TotalFileSize(c->inputs_[0]); - const uint64_t inputs1_size = TotalFileSize(c->inputs_[1]); - const uint64_t expanded0_size = TotalFileSize(expanded0); - uint64_t limit = ExpandedCompactionByteSizeLimit(level); - if (expanded0.size() > c->inputs_[0].size() && - inputs1_size + expanded0_size < limit && - !FilesInCompaction(expanded0) && - !c->input_version_->HasOverlappingUserKey(&expanded0, level)) { - InternalKey new_start, new_limit; - GetRange(expanded0, &new_start, &new_limit); - std::vector expanded1; - c->input_version_->GetOverlappingInputs(level + 1, - &new_start, - &new_limit, - &expanded1, - c->parent_index_, - &c->parent_index_); - if (expanded1.size() == c->inputs_[1].size() && - !FilesInCompaction(expanded1)) { - Log(options_->info_log, - "Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu bytes)" - "\n", - (unsigned long)level, - (unsigned long)(c->inputs_[0].size()), - (unsigned long)(c->inputs_[1].size()), - (unsigned long)inputs0_size, - (unsigned long)inputs1_size, - (unsigned 
long)(expanded0.size()), - (unsigned long)(expanded1.size()), - (unsigned long)expanded0_size, - (unsigned long)inputs1_size); - smallest = new_start; - largest = new_limit; - c->inputs_[0] = expanded0; - c->inputs_[1] = expanded1; - GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); - } - } - } - - // Compute the set of grandparent files that overlap this compaction - // (parent == level+1; grandparent == level+2) - if (level + 2 < NumberLevels()) { - c->input_version_->GetOverlappingInputs( - level + 2, &all_start, &all_limit, &c->grandparents_); - } - - if (false) { - Log(options_->info_log, "Compacting %d '%s' .. '%s'", - level, - smallest.DebugString().c_str(), - largest.DebugString().c_str()); - } - - // Update the place where we will do the next compaction for this level. - // We update this immediately instead of waiting for the VersionEdit - // to be applied so that if the compaction fails, we will try a different - // key range next time. - compact_pointer_[level] = largest.Encode().ToString(); - c->edit_->SetCompactPointer(level, largest); + compaction_picker_->ReleaseCompactionFiles(c, status); } -Status VersionSet::GetMetadataForFile( - uint64_t number, - int *filelevel, - FileMetaData *meta) { +Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, + FileMetaData* meta) { for (auto cfd : column_family_data_) { - for (int level = 0; level < NumberLevels(); level++) { - const std::vector& files = - cfd.second->current->files_[level]; + Version* version = cfd.second->current; + for (int level = 0; level < version->NumberLevels(); level++) { + const std::vector& files = version->files_[level]; for (size_t i = 0; i < files.size(); i++) { if (files[i]->number == number) { *meta = *files[i]; @@ -3064,261 +2233,4 @@ void VersionSet::DropColumnFamily(VersionEdit* edit) { column_family_data_.erase(cfd); } -Compaction* VersionSet::CompactRange(int level, - const InternalKey* begin, - const InternalKey* end) { - std::vector inputs; - 
// TODO this only works for default column family now - Version* version = column_family_data_.find(0)->second->current; - - // All files are 'overlapping' in universal style compaction. - // We have to compact the entire range in one shot. - if (options_->compaction_style == kCompactionStyleUniversal) { - begin = nullptr; - end = nullptr; - } - version->GetOverlappingInputs(level, begin, end, &inputs); - if (inputs.empty()) { - return nullptr; - } - - // Avoid compacting too much in one shot in case the range is large. - // But we cannot do this for level-0 since level-0 files can overlap - // and we must not pick one file and drop another older file if the - // two files overlap. - if (level > 0) { - const uint64_t limit = MaxFileSizeForLevel(level) * - options_->source_compaction_factor; - uint64_t total = 0; - for (size_t i = 0; i < inputs.size(); ++i) { - uint64_t s = inputs[i]->file_size; - total += s; - if (total >= limit) { - inputs.resize(i + 1); - break; - } - } - } - int out_level = (options_->compaction_style == kCompactionStyleUniversal) ? - level : level+1; - - Compaction* c = new Compaction(level, - out_level, - MaxFileSizeForLevel(out_level), - MaxGrandParentOverlapBytes(level), - NumberLevels(), - version); - - c->inputs_[0] = inputs; - ExpandWhileOverlapping(c); - if (c == nullptr) { - Log(options_->info_log, "Could not compact due to expansion failure.\n"); - return nullptr; - } - - SetupOtherInputs(c); - - // These files that are to be manaully compacted do not trample - // upon other files because manual compactions are processed when - // the system has a max of 1 background compaction thread. 
- c->MarkFilesBeingCompacted(true); - - // Is this compaction creating a file at the bottommost level - c->SetupBottomMostLevel(true); - return c; -} - -Compaction::Compaction(int level, - int out_level, - uint64_t target_file_size, - uint64_t max_grandparent_overlap_bytes, - int number_levels, - Version* input_version, - bool seek_compaction, - bool enable_compression) - : level_(level), - out_level_(out_level), - max_output_file_size_(target_file_size), - maxGrandParentOverlapBytes_(max_grandparent_overlap_bytes), - input_version_(input_version), - number_levels_(number_levels), - seek_compaction_(seek_compaction), - enable_compression_(enable_compression), - grandparent_index_(0), - seen_key_(false), - overlapped_bytes_(0), - base_index_(-1), - parent_index_(-1), - score_(0), - bottommost_level_(false), - is_full_compaction_(false), - level_ptrs_(std::vector(number_levels)) { - input_version_->Ref(); - edit_ = new VersionEdit(number_levels_); - for (int i = 0; i < number_levels_; i++) { - level_ptrs_[i] = 0; - } -} - -Compaction::~Compaction() { - delete edit_; - if (input_version_ != nullptr) { - input_version_->Unref(); - } -} - -bool Compaction::IsTrivialMove() const { - // Avoid a move if there is lots of overlapping grandparent data. - // Otherwise, the move could create a parent file that will require - // a very expensive merge later on. - return (num_input_files(0) == 1 && - num_input_files(1) == 0 && - TotalFileSize(grandparents_) <= maxGrandParentOverlapBytes_); -} - -void Compaction::AddInputDeletions(VersionEdit* edit) { - for (int which = 0; which < 2; which++) { - for (size_t i = 0; i < inputs_[which].size(); i++) { - edit->DeleteFile(level_ + which, inputs_[which][i]->number); - } - } -} - -bool Compaction::IsBaseLevelForKey(const Slice& user_key) { - if (input_version_->vset_->options_->compaction_style == - kCompactionStyleUniversal) { - return bottommost_level_; - } - // Maybe use binary search to find right entry instead of linear search? 
- const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); - for (int lvl = level_ + 2; lvl < number_levels_; lvl++) { - const std::vector& files = input_version_->files_[lvl]; - for (; level_ptrs_[lvl] < files.size(); ) { - FileMetaData* f = files[level_ptrs_[lvl]]; - if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { - // We've advanced far enough - if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { - // Key falls in this file's range, so definitely not base level - return false; - } - break; - } - level_ptrs_[lvl]++; - } - } - return true; -} - -bool Compaction::ShouldStopBefore(const Slice& internal_key) { - // Scan to find earliest grandparent file that contains key. - const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; - while (grandparent_index_ < grandparents_.size() && - icmp->Compare(internal_key, - grandparents_[grandparent_index_]->largest.Encode()) > 0) { - if (seen_key_) { - overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; - } - assert(grandparent_index_ + 1 >= grandparents_.size() || - icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(), - grandparents_[grandparent_index_+1]->smallest.Encode()) - < 0); - grandparent_index_++; - } - seen_key_ = true; - - if (overlapped_bytes_ > maxGrandParentOverlapBytes_) { - // Too much overlap for current output; start new output - overlapped_bytes_ = 0; - return true; - } else { - return false; - } -} - -// Mark (or clear) each file that is being compacted -void Compaction::MarkFilesBeingCompacted(bool value) { - for (int i = 0; i < 2; i++) { - std::vector v = inputs_[i]; - for (unsigned int j = 0; j < inputs_[i].size(); j++) { - assert(value ? !inputs_[i][j]->being_compacted : - inputs_[i][j]->being_compacted); - inputs_[i][j]->being_compacted = value; - } - } -} - -// Is this compaction producing files at the bottommost level? 
-void Compaction::SetupBottomMostLevel(bool isManual) { - if (input_version_->vset_->options_->compaction_style == - kCompactionStyleUniversal) { - // If universal compaction style is used and manual - // compaction is occuring, then we are guaranteed that - // all files will be picked in a single compaction - // run. We can safely set bottommost_level_ = true. - // If it is not manual compaction, then bottommost_level_ - // is already set when the Compaction was created. - if (isManual) { - bottommost_level_ = true; - } - return; - } - bottommost_level_ = true; - int num_levels = input_version_->vset_->NumberLevels(); - for (int i = level() + 2; i < num_levels; i++) { - if (input_version_->vset_->NumLevelFiles(i) > 0) { - bottommost_level_ = false; - break; - } - } -} - -void Compaction::ReleaseInputs() { - if (input_version_ != nullptr) { - input_version_->Unref(); - input_version_ = nullptr; - } -} - -void Compaction::ResetNextCompactionIndex() { - input_version_->ResetNextCompactionIndex(level_); -} - -static void InputSummary(std::vector& files, - char* output, - int len) { - int write = 0; - for (unsigned int i = 0; i < files.size(); i++) { - int sz = len - write; - int ret = snprintf(output + write, sz, "%lu(%lu) ", - (unsigned long)files.at(i)->number, - (unsigned long)files.at(i)->file_size); - if (ret < 0 || ret >= sz) - break; - write += ret; - } -} - -void Compaction::Summary(char* output, int len) { - int write = snprintf(output, len, - "Base version %lu Base level %d, seek compaction:%d, inputs:", - (unsigned long)input_version_->GetVersionNumber(), - level_, - seek_compaction_); - if (write < 0 || write > len) { - return; - } - - char level_low_summary[100]; - InputSummary(inputs_[0], level_low_summary, sizeof(level_low_summary)); - char level_up_summary[100]; - if (inputs_[1].size()) { - InputSummary(inputs_[1], level_up_summary, sizeof(level_up_summary)); - } else { - level_up_summary[0] = '\0'; - } - - snprintf(output + write, len - write, 
"[%s],[%s]", - level_low_summary, level_up_summary); -} - } // namespace rocksdb diff --git a/db/version_set.h b/db/version_set.h index b89083613..65a1406aa 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -27,12 +27,15 @@ #include "db/version_edit.h" #include "port/port.h" #include "db/table_cache.h" +#include "db/compaction.h" +#include "db/compaction_picker.h" namespace rocksdb { namespace log { class Writer; } class Compaction; +class CompactionPicker; class Iterator; class MemTable; class TableCache; @@ -86,6 +89,11 @@ class Version { // REQUIRES: lock is held bool UpdateStats(const GetStats& stats); + // Updates internal structures that keep track of compaction scores + // We use compaction scores to figure out which compaction to do next + // Also pre-sorts level0 files for Get() + void Finalize(std::vector& size_being_compacted); + // Reference count management (so Versions do not disappear out from // under live iterators) void Ref(); @@ -135,21 +143,54 @@ class Version { int PickLevelForMemTableOutput(const Slice& smallest_user_key, const Slice& largest_user_key); - int NumFiles(int level) const { return files_[level].size(); } + int NumberLevels() const { return num_levels_; } + + // REQUIRES: lock is held + int NumLevelFiles(int level) const { return files_[level].size(); } + + // Return the combined file size of all files at the specified level. + int64_t NumLevelBytes(int level) const; + + // Return a human-readable short (single-line) summary of the number + // of files per level. Uses *scratch as backing store. + struct LevelSummaryStorage { + char buffer[100]; + }; + struct FileSummaryStorage { + char buffer[1000]; + }; + const char* LevelSummary(LevelSummaryStorage* scratch) const; + // Return a human-readable short (single-line) summary of files + // in a specified level. Uses *scratch as backing store. 
+ const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const; + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64_t MaxNextLevelOverlappingBytes(); + + // Add all files listed in the current version to *live. + void AddLiveFiles(std::set* live); // Return a human readable string that describes this version's contents. std::string DebugString(bool hex = false) const; // Returns the version nuber of this version - uint64_t GetVersionNumber() { - return version_number_; - } + uint64_t GetVersionNumber() const { return version_number_; } + + // used to sort files by size + struct Fsize { + int index; + FileMetaData* file; + }; private: friend class Compaction; friend class VersionSet; friend class DBImpl; friend struct ColumnFamilyData; + friend class CompactionPicker; + friend class LevelCompactionPicker; + friend class UniversalCompactionPicker; class LevelFileNumIterator; Iterator* NewConcatenatingIterator(const ReadOptions&, @@ -158,10 +199,15 @@ class Version { bool PrefixMayMatch(const ReadOptions& options, const EnvOptions& soptions, const Slice& internal_prefix, Iterator* level_iter) const; + // Sort all files for this version based on their file size and + // record results in files_by_size_. The largest files are listed first. 
+ void UpdateFilesBySize(); + VersionSet* vset_; // VersionSet to which this Version belongs Version* next_; // Next version in linked list Version* prev_; // Previous version in linked list int refs_; // Number of live refs to this version + int num_levels_; // Number of levels // List of files per level, files in each level are arranged // in increasing order of keys @@ -251,10 +297,8 @@ struct ColumnFamilyData { class VersionSet { public: - VersionSet(const std::string& dbname, - const Options* options, - const EnvOptions& storage_options, - TableCache* table_cache, + VersionSet(const std::string& dbname, const Options* options, + const EnvOptions& storage_options, TableCache* table_cache, const InternalKeyComparator*); ~VersionSet(); @@ -292,6 +336,12 @@ class VersionSet { return column_family_data_.find(0)->second->current; } + // A Flag indicating whether write needs to slowdown because of there are + // too many number of level0 files. + bool NeedSlowdownForNumLevel0Files() const { + return need_slowdown_for_num_level0_files_; + } + // Return the current manifest file number uint64_t ManifestFileNumber() const { return manifest_file_number_; } @@ -307,12 +357,6 @@ class VersionSet { } } - // Return the number of Table files at the specified level. - int NumLevelFiles(int level) const; - - // Return the combined file size of all files at the specified level. - int64_t NumLevelBytes(int level) const; - // Return the last sequence number. uint64_t LastSequence() const { return last_sequence_.load(std::memory_order_acquire); @@ -346,14 +390,18 @@ class VersionSet { // the specified level. Returns nullptr if there is nothing in that // level that overlaps the specified range. Caller should delete // the result. - Compaction* CompactRange( - int level, - const InternalKey* begin, - const InternalKey* end); - - // Return the maximum overlapping data (in bytes) at next level for any - // file at a level >= 1. 
- int64_t MaxNextLevelOverlappingBytes(); + // + // The returned Compaction might not include the whole requested range. + // In that case, compaction_end will be set to the next key that needs + // compacting. In case the compaction will compact the whole range, + // compaction_end will be set to nullptr. + // Client is responsible for compaction_end storage -- when called, + // *compaction_end should point to valid InternalKey! + Compaction* CompactRange(int input_level, + int output_level, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end); // Create an iterator that reads over the compaction inputs for "*c". // The caller should delete the iterator when no longer needed. @@ -405,58 +453,16 @@ class VersionSet { // Add all files listed in any live version to *live. void AddLiveFiles(std::vector* live_list); - // Add all files listed in the current version to *live. - void AddLiveFilesCurrentVersion(std::set* live); - // Return the approximate offset in the database of the data for // "key" as of version "v". uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); - // Return a human-readable short (single-line) summary of the number - // of files per level. Uses *scratch as backing store. - struct LevelSummaryStorage { - char buffer[100]; - }; - struct FileSummaryStorage { - char buffer[1000]; - }; - const char* LevelSummary(LevelSummaryStorage* scratch) const; - // printf contents (for debugging) Status DumpManifest(Options& options, std::string& manifestFileName, bool verbose, bool hex = false); - // Return a human-readable short (single-line) summary of the data size - // of files per level. Uses *scratch as backing store. - const char* LevelDataSizeSummary(LevelSummaryStorage* scratch) const; - - // Return a human-readable short (single-line) summary of files - // in a specified level. Uses *scratch as backing store. 
- const char* LevelFileSummary(Version* version, - FileSummaryStorage* scratch, - int level) const; - // Return the size of the current manifest file - const uint64_t ManifestFileSize() { return manifest_file_size_; } - - // For the specfied level, pick a compaction. - // Returns nullptr if there is no compaction to be done. - // If level is 0 and there is already a compaction on that level, this - // function will return nullptr. - Compaction* PickCompactionBySize(int level, double score); - - // Pick files to compact in Universal mode - Compaction* PickCompactionUniversal(int level, double score); - - // Pick Universal compaction to limit read amplification - Compaction* PickCompactionUniversalReadAmp(int level, double score, - unsigned int ratio, unsigned int num_files); - - // Pick Universal compaction to limit space amplification. - Compaction* PickCompactionUniversalSizeAmp(int level, double score); - - // Free up the files that were participated in a compaction - void ReleaseCompactionFiles(Compaction* c, Status status); + uint64_t ManifestFileSize() const { return manifest_file_size_; } // verify that the files that we started with for a compaction // still exist in the current version and in the same original level. @@ -464,20 +470,12 @@ class VersionSet { // pick the same files to compact. bool VerifyCompactionFileConsistency(Compaction* c); - // used to sort files by size - typedef struct fsize { - int index; - FileMetaData* file; - } Fsize; - - // Sort all files for this version based on their file size and - // record results in files_by_size_. The largest files are listed first. - void UpdateFilesBySize(Version *v); + double MaxBytesForLevel(int level); // Get the max file size in a given level. 
uint64_t MaxFileSizeForLevel(int level); - double MaxBytesForLevel(int level); + void ReleaseCompactionFiles(Compaction* c, Status status); Status GetMetadataForFile( uint64_t number, int *filelevel, FileMetaData *metadata); @@ -503,23 +501,6 @@ class VersionSet { friend class Compaction; friend class Version; - void Init(int num_levels); - - void Finalize(Version* v, std::vector&); - - void GetRange(const std::vector& inputs, - InternalKey* smallest, - InternalKey* largest); - - void GetRange2(const std::vector& inputs1, - const std::vector& inputs2, - InternalKey* smallest, - InternalKey* largest); - - void ExpandWhileOverlapping(Compaction* c); - - void SetupOtherInputs(Compaction* c); - // Save current contents to *log Status WriteSnapshot(log::Writer* log); @@ -527,10 +508,6 @@ class VersionSet { bool ManifestContains(const std::string& record) const; - uint64_t ExpandedCompactionByteSizeLimit(int level); - - uint64_t MaxGrandParentOverlapBytes(int level); - Env* const env_; const std::string dbname_; const Options* const options_; @@ -547,18 +524,13 @@ class VersionSet { // Opened lazily unique_ptr descriptor_log_; - // Per-level key at which the next compaction at that level should start. - // Either an empty string, or a valid InternalKey. - std::string* compact_pointer_; - - // Per-level target file size. 
- uint64_t* max_file_size_; + // A flag indicating whether we should delay writes because + // we have too many level 0 files + bool need_slowdown_for_num_level0_files_; - // Per-level max bytes - uint64_t* level_max_bytes_; - - // record all the ongoing compactions for all levels - std::vector > compactions_in_progress_; + // An object that keeps all the compaction stats + // and picks the next compaction + std::unique_ptr compaction_picker_; // generates a increasing version number for every new version uint64_t current_version_number_; @@ -566,7 +538,7 @@ class VersionSet { // Queue of writers to the manifest file std::deque manifest_writers_; - // size of manifest file + // Current size of manifest file uint64_t manifest_file_size_; std::vector obsolete_files_; @@ -582,138 +554,8 @@ class VersionSet { VersionSet(const VersionSet&); void operator=(const VersionSet&); - // Return the total amount of data that is undergoing - // compactions per level - void SizeBeingCompacted(std::vector&); - - // Returns true if any one of the parent files are being compacted - bool ParentRangeInCompaction(const InternalKey* smallest, - const InternalKey* largest, int level, int* index); - - // Returns true if any one of the specified files are being compacted - bool FilesInCompaction(std::vector& files); - void LogAndApplyHelper(Builder*b, Version* v, VersionEdit* edit, port::Mutex* mu); }; -// A Compaction encapsulates information about a compaction. -class Compaction { - public: - ~Compaction(); - - // Return the level that is being compacted. Inputs from "level" - // will be merged. - int level() const { return level_; } - - // Outputs will go to this level - int output_level() const { return out_level_; } - - // Return the object that holds the edits to the descriptor done - // by this compaction. 
- VersionEdit* edit() { return edit_; } - - // "which" must be either 0 or 1 - int num_input_files(int which) const { return inputs_[which].size(); } - - // Return the ith input file at "level()+which" ("which" must be 0 or 1). - FileMetaData* input(int which, int i) const { return inputs_[which][i]; } - - // Maximum size of files to build during this compaction. - uint64_t MaxOutputFileSize() const { return max_output_file_size_; } - - // Whether compression will be enabled for compaction outputs - bool enable_compression() const { return enable_compression_; } - - // Is this a trivial compaction that can be implemented by just - // moving a single input file to the next level (no merging or splitting) - bool IsTrivialMove() const; - - // Add all inputs to this compaction as delete operations to *edit. - void AddInputDeletions(VersionEdit* edit); - - // Returns true if the information we have available guarantees that - // the compaction is producing data in "level+1" for which no data exists - // in levels greater than "level+1". - bool IsBaseLevelForKey(const Slice& user_key); - - // Returns true iff we should stop building the current output - // before processing "internal_key". - bool ShouldStopBefore(const Slice& internal_key); - - // Release the input version for the compaction, once the compaction - // is successful. - void ReleaseInputs(); - - void Summary(char* output, int len); - - // Return the score that was used to pick this compaction run. - double score() const { return score_; } - - // Is this compaction creating a file in the bottom most level? - bool BottomMostLevel() { return bottommost_level_; } - - // Does this compaction include all sst files? 
- bool IsFullCompaction() { return is_full_compaction_; } - - private: - friend class Version; - friend class VersionSet; - - Compaction(int level, - int out_level, - uint64_t target_file_size, - uint64_t max_grandparent_overlap_bytes, - int number_levels, - Version* input_version, - bool seek_compaction = false, - bool enable_compression = true); - - int level_; - int out_level_; // levels to which output files are stored - uint64_t max_output_file_size_; - uint64_t maxGrandParentOverlapBytes_; - Version* input_version_; - VersionEdit* edit_; - int number_levels_; - - bool seek_compaction_; - bool enable_compression_; - - // Each compaction reads inputs from "level_" and "level_+1" - std::vector inputs_[2]; // The two sets of inputs - - // State used to check for number of of overlapping grandparent files - // (parent == level_ + 1, grandparent == level_ + 2) - std::vector grandparents_; - size_t grandparent_index_; // Index in grandparent_starts_ - bool seen_key_; // Some output key has been seen - uint64_t overlapped_bytes_; // Bytes of overlap between current output - // and grandparent files - int base_index_; // index of the file in files_[level_] - int parent_index_; // index of some file with same range in files_[level_+1] - double score_; // score that was used to pick this compaction. - - // Is this compaction creating a file in the bottom most level? - bool bottommost_level_; - // Does this compaction include all sst files? - bool is_full_compaction_; - - // level_ptrs_ holds indices into input_version_->levels_: our state - // is that we are positioned at one of the file ranges for each - // higher level than the ones involved in this compaction (i.e. for - // all L >= level_ + 2). 
- std::vector level_ptrs_; - - // mark (or clear) all files that are being compacted - void MarkFilesBeingCompacted(bool); - - // Initialize whether compaction producing files at the bottommost level - void SetupBottomMostLevel(bool isManual); - - // In case of compaction error, reset the nextIndex that is used - // to pick up the next file to be compacted from files_by_size_ - void ResetNextCompactionIndex(); -}; - } // namespace rocksdb diff --git a/db/version_set_reduce_num_levels.cc b/db/version_set_reduce_num_levels.cc index 89653ea7c..e22b82a5a 100644 --- a/db/version_set_reduce_num_levels.cc +++ b/db/version_set_reduce_num_levels.cc @@ -26,7 +26,7 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { // TODO this only works for default column family now Version* current_version = column_family_data_.find(0)->second->current; - int current_levels = NumberLevels(); + int current_levels = current_version->NumberLevels(); if (current_levels <= new_levels) { return Status::OK(); @@ -37,7 +37,7 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { int first_nonempty_level = -1; int first_nonempty_level_filenum = 0; for (int i = new_levels - 1; i < current_levels; i++) { - int file_num = NumLevelFiles(i); + int file_num = current_version->NumLevelFiles(i); if (file_num != 0) { if (first_nonempty_level < 0) { first_nonempty_level = i; @@ -66,15 +66,12 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { delete[] current_version->files_; current_version->files_ = new_files_list; + current_version->num_levels_ = new_levels; - delete[] compact_pointer_; - delete[] max_file_size_; - delete[] level_max_bytes_; num_levels_ = new_levels; - compact_pointer_ = new std::string[new_levels]; - Init(new_levels); - VersionEdit ve(new_levels); - st = LogAndApply(&ve , mu, true); + compaction_picker_->ReduceNumberOfLevels(new_levels); + VersionEdit ve; + st = LogAndApply(&ve, mu, true); return st; } diff --git 
a/db/write_batch.cc b/db/write_batch.cc index eae0903c6..af4790ce5 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -31,7 +31,7 @@ #include "db/snapshot.h" #include "db/write_batch_internal.h" #include "util/coding.h" -#include "util/statistics_imp.h" +#include "util/statistics.h" #include namespace rocksdb { @@ -39,7 +39,8 @@ namespace rocksdb { // WriteBatch header has an 8-byte sequence number followed by a 4-byte count. static const size_t kHeader = 12; -WriteBatch::WriteBatch() { +WriteBatch::WriteBatch(size_t reserved_bytes) { + rep_.reserve((reserved_bytes > kHeader) ? reserved_bytes : kHeader); Clear(); } diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 490a4401f..396e3ea6e 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -22,10 +22,11 @@ namespace rocksdb { static std::string PrintContents(WriteBatch* b) { InternalKeyComparator cmp(BytewiseComparator()); auto factory = std::make_shared(); - MemTable* mem = new MemTable(cmp, factory.get()); + Options options; + options.memtable_factory = factory; + MemTable* mem = new MemTable(cmp, options); mem->Ref(); std::string state; - Options options; Status s = WriteBatchInternal::InsertInto(b, mem, &options); int count = 0; Iterator* iter = mem->NewIterator(); diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 086a18014..b60c96cbe 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -291,6 +291,7 @@ class DB { } // Compact the underlying storage for the key range [*begin,*end]. + // The actual compaction interval might be superset of [*begin, *end]. // In particular, deleted and overwritten versions are discarded, // and the data is rearranged to reduce the cost of operations // needed to access the data. 
This operation should typically only diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index fcb782d41..2fca8d161 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -111,27 +111,23 @@ class MemTableRep { }; // Return an iterator over the keys in this representation. - virtual std::shared_ptr GetIterator() = 0; + virtual Iterator* GetIterator() = 0; // Return an iterator over at least the keys with the specified user key. The // iterator may also allow access to other keys, but doesn't have to. Default: // GetIterator(). - virtual std::shared_ptr GetIterator(const Slice& user_key) { - return GetIterator(); - } + virtual Iterator* GetIterator(const Slice& user_key) { return GetIterator(); } // Return an iterator over at least the keys with the specified prefix. The // iterator may also allow access to other keys, but doesn't have to. Default: // GetIterator(). - virtual std::shared_ptr GetPrefixIterator(const Slice& prefix) { + virtual Iterator* GetPrefixIterator(const Slice& prefix) { return GetIterator(); } // Return an iterator that has a special Seek semantics. The result of // a Seek might only include keys with the same prefix as the target key. 
- virtual std::shared_ptr GetDynamicPrefixIterator() { - return GetIterator(); - } + virtual Iterator* GetDynamicPrefixIterator() { return GetIterator(); } protected: // When *key is an internal key concatenated with the value, returns the @@ -144,8 +140,8 @@ class MemTableRep { class MemTableRepFactory { public: virtual ~MemTableRepFactory() { }; - virtual std::shared_ptr CreateMemTableRep( - MemTableRep::KeyComparator&, Arena*) = 0; + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, + Arena*) = 0; virtual const char* Name() const = 0; }; @@ -161,8 +157,8 @@ class VectorRepFactory : public MemTableRepFactory { const size_t count_; public: explicit VectorRepFactory(size_t count = 0) : count_(count) { } - virtual std::shared_ptr CreateMemTableRep( - MemTableRep::KeyComparator&, Arena*) override; + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, + Arena*) override; virtual const char* Name() const override { return "VectorRepFactory"; } @@ -171,8 +167,8 @@ public: // This uses a skip list to store keys. It is the default. class SkipListFactory : public MemTableRepFactory { public: - virtual std::shared_ptr CreateMemTableRep( - MemTableRep::KeyComparator&, Arena*) override; + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, + Arena*) override; virtual const char* Name() const override { return "SkipListFactory"; } diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 011e510f5..f5fbb5924 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -242,53 +242,10 @@ struct HistogramData { double standard_deviation; }; - -class Histogram { - public: - // clear's the histogram - virtual void Clear() = 0; - virtual ~Histogram(); - // Add a value to be recorded in the histogram. 
- virtual void Add(uint64_t value) = 0; - - virtual std::string ToString() const = 0; - - // Get statistics - virtual double Median() const = 0; - virtual double Percentile(double p) const = 0; - virtual double Average() const = 0; - virtual double StandardDeviation() const = 0; - virtual void Data(HistogramData * const data) const = 0; - -}; - -/** - * A dumb ticker which keeps incrementing through its life time. - * Thread safe. Locking managed by implementation of this interface. - */ -class Ticker { - public: - Ticker() : count_(0) { } - - inline void setTickerCount(uint64_t count) { - count_ = count; - } - - inline void recordTick(int count = 1) { - count_ += count; - } - - inline uint64_t getCount() { - return count_; - } - - private: - std::atomic_uint_fast64_t count_; -}; - // Analyze the performance of a db class Statistics { public: + virtual ~Statistics() {} virtual long getTickerCount(Tickers tickerType) = 0; virtual void recordTick(Tickers tickerType, uint64_t count = 0) = 0; diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index bc1d63ce4..a0072ce68 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -36,7 +36,7 @@ struct SliceParts; class WriteBatch { public: - WriteBatch(); + explicit WriteBatch(size_t reserved_bytes = 0); ~WriteBatch(); // Store the mapping "key->value" in the database. @@ -122,7 +122,10 @@ class WriteBatch { Status Iterate(Handler* handler) const; // Retrieve the serialized version of this batch. - std::string Data() { return rep_; } + const std::string& Data() const { return rep_; } + + // Retrieve data size of the batch. 
+ size_t GetDataSize() const { return rep_.size(); } // Returns the number of updates in the batch int Count() const; diff --git a/table/table_test.cc b/table/table_test.cc index 1f79fcdf9..9907550ce 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -12,7 +12,8 @@ #include #include "db/dbformat.h" -#include "db/db_statistics.h" +#include "rocksdb/statistics.h" +#include "util/statistics.h" #include "db/memtable.h" #include "db/write_batch_internal.h" #include "rocksdb/cache.h" @@ -370,7 +371,9 @@ class MemTableConstructor: public Constructor { : Constructor(cmp), internal_comparator_(cmp), table_factory_(new SkipListFactory) { - memtable_ = new MemTable(internal_comparator_, table_factory_.get()); + Options options; + options.memtable_factory = table_factory_; + memtable_ = new MemTable(internal_comparator_, options); memtable_->Ref(); } ~MemTableConstructor() { @@ -378,7 +381,9 @@ class MemTableConstructor: public Constructor { } virtual Status FinishImpl(const Options& options, const KVMap& data) { delete memtable_->Unref(); - memtable_ = new MemTable(internal_comparator_, table_factory_.get()); + Options memtable_options; + memtable_options.memtable_factory = table_factory_; + memtable_ = new MemTable(internal_comparator_, memtable_options); memtable_->Ref(); int seq = 1; for (KVMap::const_iterator it = data.begin(); @@ -931,18 +936,12 @@ TEST(TableTest, NumBlockStat) { class BlockCacheProperties { public: explicit BlockCacheProperties(Statistics* statistics) { - block_cache_miss = - statistics->getTickerCount(BLOCK_CACHE_MISS); - block_cache_hit = - statistics->getTickerCount(BLOCK_CACHE_HIT); - index_block_cache_miss = - statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS); - index_block_cache_hit = - statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT); - data_block_cache_miss = - statistics->getTickerCount(BLOCK_CACHE_DATA_MISS); - data_block_cache_hit = - statistics->getTickerCount(BLOCK_CACHE_DATA_HIT); + block_cache_miss = 
statistics->getTickerCount(BLOCK_CACHE_MISS); + block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_HIT); + index_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS); + index_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT); + data_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_DATA_MISS); + data_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_DATA_HIT); } // Check if the fetched props matches the expected ones. @@ -1268,10 +1267,11 @@ class MemTableTest { }; TEST(MemTableTest, Simple) { InternalKeyComparator cmp(BytewiseComparator()); auto table_factory = std::make_shared(); - MemTable* memtable = new MemTable(cmp, table_factory.get()); + Options options; + options.memtable_factory = table_factory; + MemTable* memtable = new MemTable(cmp, options); memtable->Ref(); WriteBatch batch; - Options options; WriteBatchInternal::SetSequence(&batch, 100); batch.Put(std::string("k1"), std::string("v1")); batch.Put(std::string("k2"), std::string("v2")); diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 966f007e8..8321c7eaf 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -26,7 +26,7 @@ #include #include "db/db_impl.h" #include "db/version_set.h" -#include "db/db_statistics.h" +#include "rocksdb/statistics.h" #include "rocksdb/cache.h" #include "utilities/utility_db.h" #include "rocksdb/env.h" diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc index c669769e0..e9fe1573a 100644 --- a/util/hash_skiplist_rep.cc +++ b/util/hash_skiplist_rep.cc @@ -31,17 +31,15 @@ class HashSkipListRep : public MemTableRep { virtual ~HashSkipListRep(); - virtual std::shared_ptr GetIterator() override; + virtual MemTableRep::Iterator* GetIterator() override; - virtual std::shared_ptr GetIterator( - const Slice& slice) override; + virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override; - virtual std::shared_ptr GetPrefixIterator( - const Slice& prefix) override; - - virtual 
std::shared_ptr GetDynamicPrefixIterator() + virtual MemTableRep::Iterator* GetPrefixIterator(const Slice& prefix) override; + virtual MemTableRep::Iterator* GetDynamicPrefixIterator() override; + private: friend class DynamicIterator; typedef SkipList Bucket; @@ -208,18 +206,15 @@ class HashSkipListRep : public MemTableRep { virtual void SeekToLast() { } private: }; - - std::shared_ptr empty_iterator_; }; HashSkipListRep::HashSkipListRep(MemTableRep::KeyComparator& compare, - Arena* arena, const SliceTransform* transform, size_t bucket_size) - : bucket_size_(bucket_size), - transform_(transform), - compare_(compare), - arena_(arena), - empty_iterator_(std::make_shared()) { - + Arena* arena, const SliceTransform* transform, + size_t bucket_size) + : bucket_size_(bucket_size), + transform_(transform), + compare_(compare), + arena_(arena) { buckets_ = new port::AtomicPointer[bucket_size]; for (size_t i = 0; i < bucket_size_; ++i) { @@ -263,7 +258,7 @@ size_t HashSkipListRep::ApproximateMemoryUsage() { return sizeof(buckets_); } -std::shared_ptr HashSkipListRep::GetIterator() { +MemTableRep::Iterator* HashSkipListRep::GetIterator() { auto list = new Bucket(compare_, arena_); for (size_t i = 0; i < bucket_size_; ++i) { auto bucket = GetBucket(i); @@ -274,35 +269,30 @@ std::shared_ptr HashSkipListRep::GetIterator() { } } } - return std::make_shared(list); + return new Iterator(list); } -std::shared_ptr HashSkipListRep::GetPrefixIterator( - const Slice& prefix) { +MemTableRep::Iterator* HashSkipListRep::GetPrefixIterator(const Slice& prefix) { auto bucket = GetBucket(prefix); if (bucket == nullptr) { - return empty_iterator_; + return new EmptyIterator(); } - return std::make_shared(bucket, false); + return new Iterator(bucket, false); } -std::shared_ptr HashSkipListRep::GetIterator( - const Slice& slice) { +MemTableRep::Iterator* HashSkipListRep::GetIterator(const Slice& slice) { return GetPrefixIterator(transform_->Transform(slice)); } -std::shared_ptr - 
HashSkipListRep::GetDynamicPrefixIterator() { - return std::make_shared(*this); +MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator() { + return new DynamicIterator(*this); } } // anon namespace -std::shared_ptr -HashSkipListRepFactory::CreateMemTableRep(MemTableRep::KeyComparator &compare, - Arena *arena) { - return std::make_shared(compare, arena, transform_, - bucket_count_); +MemTableRep* HashSkipListRepFactory::CreateMemTableRep( + MemTableRep::KeyComparator& compare, Arena* arena) { + return new HashSkipListRep(compare, arena, transform_, bucket_count_); } MemTableRepFactory* NewHashSkipListRepFactory( diff --git a/util/hash_skiplist_rep.h b/util/hash_skiplist_rep.h index b946cf05e..7b8414c88 100644 --- a/util/hash_skiplist_rep.h +++ b/util/hash_skiplist_rep.h @@ -21,8 +21,8 @@ class HashSkipListRepFactory : public MemTableRepFactory { virtual ~HashSkipListRepFactory() { delete transform_; } - virtual std::shared_ptr CreateMemTableRep( - MemTableRep::KeyComparator& compare, Arena* arena) override; + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator& compare, + Arena* arena) override; virtual const char* Name() const override { return "HashSkipListRepFactory"; diff --git a/util/histogram.cc b/util/histogram.cc index e83998014..968769cef 100644 --- a/util/histogram.cc +++ b/util/histogram.cc @@ -16,27 +16,38 @@ namespace rocksdb { -HistogramBucketMapper::HistogramBucketMapper() : - // Add newer bucket index here. - // Should be alwyas added in sorted order. 
- bucketValues_({ - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 45, - 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 300, 350, 400, 450, - 500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000, - 3500, 4000, 4500, 5000, 6000, 7000, 8000, 9000, 10000, 12000, 14000, - 16000, 18000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 60000, - 70000, 80000, 90000, 100000, 120000, 140000, 160000, 180000, 200000, - 250000, 300000, 350000, 400000, 450000, 500000, 600000, 700000, 800000, - 900000, 1000000, 1200000, 1400000, 1600000, 1800000, 2000000, 2500000, - 3000000, 3500000, 4000000, 4500000, 5000000, 6000000, 7000000, 8000000, - 9000000, 10000000, 12000000, 14000000, 16000000, 18000000, 20000000, - 25000000, 30000000, 35000000, 40000000, 45000000, 50000000, 60000000, - 70000000, 80000000, 90000000, 100000000, 120000000, 140000000, 160000000, - 180000000, 200000000, 250000000, 300000000, 350000000, 400000000, - 450000000, 500000000, 600000000, 700000000, 800000000, 900000000, - 1000000000}), - maxBucketValue_(bucketValues_.back()), - minBucketValue_(bucketValues_.front()) { +HistogramBucketMapper::HistogramBucketMapper() + : + // Add newer bucket index here. + // Should be alwyas added in sorted order. 
+ // If you change this, you also need to change + // size of array buckets_ in HistogramImpl + bucketValues_( + {1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 12, 14, + 16, 18, 20, 25, 30, 35, + 40, 45, 50, 60, 70, 80, + 90, 100, 120, 140, 160, 180, + 200, 250, 300, 350, 400, 450, + 500, 600, 700, 800, 900, 1000, + 1200, 1400, 1600, 1800, 2000, 2500, + 3000, 3500, 4000, 4500, 5000, 6000, + 7000, 8000, 9000, 10000, 12000, 14000, + 16000, 18000, 20000, 25000, 30000, 35000, + 40000, 45000, 50000, 60000, 70000, 80000, + 90000, 100000, 120000, 140000, 160000, 180000, + 200000, 250000, 300000, 350000, 400000, 450000, + 500000, 600000, 700000, 800000, 900000, 1000000, + 1200000, 1400000, 1600000, 1800000, 2000000, 2500000, + 3000000, 3500000, 4000000, 4500000, 5000000, 6000000, + 7000000, 8000000, 9000000, 10000000, 12000000, 14000000, + 16000000, 18000000, 20000000, 25000000, 30000000, 35000000, + 40000000, 45000000, 50000000, 60000000, 70000000, 80000000, + 90000000, 100000000, 120000000, 140000000, 160000000, 180000000, + 200000000, 250000000, 300000000, 350000000, 400000000, 450000000, + 500000000, 600000000, 700000000, 800000000, 900000000, 1000000000}), + maxBucketValue_(bucketValues_.back()), + minBucketValue_(bucketValues_.front()) { for (size_t i =0; i < bucketValues_.size(); ++i) { valueIndexMap_[bucketValues_[i]] = i; } @@ -62,24 +73,17 @@ namespace { const HistogramBucketMapper bucketMapper; } - -HistogramImpl::HistogramImpl() : - min_(bucketMapper.LastValue()), - max_(0), - num_(0), - sum_(0), - sum_squares_(0), - buckets_(std::vector(bucketMapper.BucketCount(), 0)) {} - void HistogramImpl::Clear() { min_ = bucketMapper.LastValue(); max_ = 0; num_ = 0; sum_ = 0; sum_squares_ = 0; - buckets_.resize(bucketMapper.BucketCount(), 0); + memset(buckets_, 0, sizeof buckets_); } +bool HistogramImpl::Empty() { return sum_squares_ == 0; } + void HistogramImpl::Add(uint64_t value) { const size_t index = bucketMapper.IndexForValue(value); buckets_[index] += 1; diff --git 
a/util/histogram.h b/util/histogram.h index c01594da7..d95588dc2 100644 --- a/util/histogram.h +++ b/util/histogram.h @@ -52,9 +52,8 @@ class HistogramBucketMapper { class HistogramImpl { public: - HistogramImpl(); - virtual ~HistogramImpl() {} virtual void Clear(); + virtual bool Empty(); virtual void Add(uint64_t value); void Merge(const HistogramImpl& other); @@ -67,13 +66,14 @@ class HistogramImpl { virtual void Data(HistogramData * const data) const; private: - double min_; - double max_; - double num_; - double sum_; - double sum_squares_; - std::vector buckets_; - + // To be able to use HistogramImpl as thread local variable, its constructor + // has to be static. That's why we're using manually values from BucketMapper + double min_ = 1000000000; // this is BucketMapper:LastValue() + double max_ = 0; + double num_ = 0; + double sum_ = 0; + double sum_squares_ = 0; + uint64_t buckets_[138] = {0}; // this is BucketMapper::BucketCount() }; } // namespace rocksdb diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 58d81460e..65ecd61a2 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -1024,7 +1024,7 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, } int max = -1; for (int i = 0; i < versions.NumberLevels(); i++) { - if (versions.NumLevelFiles(i)) { + if (versions.current()->NumLevelFiles(i)) { max = i; } } diff --git a/util/manual_compaction_test.cc b/util/manual_compaction_test.cc index ebe1339e5..dd615f057 100644 --- a/util/manual_compaction_test.cc +++ b/util/manual_compaction_test.cc @@ -9,9 +9,13 @@ #include #include "rocksdb/db.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/slice.h" #include "rocksdb/write_batch.h" #include "util/testharness.h" +using namespace rocksdb; + namespace { const int kNumKeys = 1100000; @@ -26,12 +30,71 @@ std::string Key2(int i) { return Key1(i) + "_xxx"; } -class ManualCompactionTest { }; +class ManualCompactionTest { + public: + ManualCompactionTest() { + // Get rid of any state from an old 
run. + dbname_ = rocksdb::test::TmpDir() + "/rocksdb_cbug_test"; + DestroyDB(dbname_, rocksdb::Options()); + } + + std::string dbname_; +}; + +class DestroyAllCompactionFilter : public CompactionFilter { + public: + DestroyAllCompactionFilter() {} + + virtual bool Filter(int level, + const Slice& key, + const Slice& existing_value, + std::string* new_value, + bool* value_changed) const { + return existing_value.ToString() == "destroy"; + } + + virtual const char* Name() const { + return "DestroyAllCompactionFilter"; + } +}; + +TEST(ManualCompactionTest, CompactTouchesAllKeys) { + for (int iter = 0; iter < 2; ++iter) { + DB* db; + Options options; + if (iter == 0) { // level compaction + options.num_levels = 3; + options.compaction_style = kCompactionStyleLevel; + } else { // universal compaction + options.compaction_style = kCompactionStyleUniversal; + } + options.create_if_missing = true; + options.compression = rocksdb::kNoCompression; + options.compaction_filter = new DestroyAllCompactionFilter(); + ASSERT_OK(DB::Open(options, dbname_, &db)); + + db->Put(WriteOptions(), Slice("key1"), Slice("destroy")); + db->Put(WriteOptions(), Slice("key2"), Slice("destroy")); + db->Put(WriteOptions(), Slice("key3"), Slice("value3")); + db->Put(WriteOptions(), Slice("key4"), Slice("destroy")); + + Slice key4("key4"); + db->CompactRange(nullptr, &key4); + Iterator* itr = db->NewIterator(ReadOptions()); + itr->SeekToFirst(); + ASSERT_TRUE(itr->Valid()); + ASSERT_EQ("key3", itr->key().ToString()); + itr->Next(); + ASSERT_TRUE(!itr->Valid()); + delete itr; + + delete options.compaction_filter; + delete db; + DestroyDB(dbname_, options); + } +} TEST(ManualCompactionTest, Test) { - // Get rid of any state from an old run. - std::string dbpath = rocksdb::test::TmpDir() + "/rocksdb_cbug_test"; - DestroyDB(dbpath, rocksdb::Options()); // Open database. 
Disable compression since it affects the creation // of layers and the code below is trying to test against a very @@ -40,7 +103,7 @@ TEST(ManualCompactionTest, Test) { rocksdb::Options db_options; db_options.create_if_missing = true; db_options.compression = rocksdb::kNoCompression; - ASSERT_OK(rocksdb::DB::Open(db_options, dbpath, &db)); + ASSERT_OK(rocksdb::DB::Open(db_options, dbname_, &db)); // create first key range rocksdb::WriteBatch batch; @@ -83,7 +146,7 @@ TEST(ManualCompactionTest, Test) { // close database delete db; - DestroyDB(dbpath, rocksdb::Options()); + DestroyDB(dbname_, rocksdb::Options()); } } // anonymous namespace diff --git a/util/skiplistrep.cc b/util/skiplistrep.cc index 955d754b1..a5b072ad1 100644 --- a/util/skiplistrep.cc +++ b/util/skiplistrep.cc @@ -90,15 +90,15 @@ public: // Unhide default implementations of GetIterator using MemTableRep::GetIterator; - virtual std::shared_ptr GetIterator() override { - return std::make_shared(&skip_list_); + virtual MemTableRep::Iterator* GetIterator() override { + return new SkipListRep::Iterator(&skip_list_); } }; } -std::shared_ptr SkipListFactory::CreateMemTableRep ( - MemTableRep::KeyComparator& compare, Arena* arena) { - return std::shared_ptr(new SkipListRep(compare, arena)); +MemTableRep* SkipListFactory::CreateMemTableRep( + MemTableRep::KeyComparator& compare, Arena* arena) { + return new SkipListRep(compare, arena); } } // namespace rocksdb diff --git a/util/statistics.cc b/util/statistics.cc index 5f7a5ba46..f19a777c1 100644 --- a/util/statistics.cc +++ b/util/statistics.cc @@ -3,12 +3,48 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
// +#include "util/statistics.h" #include "rocksdb/statistics.h" #include namespace rocksdb { +std::shared_ptr CreateDBStatistics() { + return std::make_shared(); +} + +StatisticsImpl::StatisticsImpl() {} + +StatisticsImpl::~StatisticsImpl() {} + +long StatisticsImpl::getTickerCount(Tickers tickerType) { + assert(tickerType < TICKER_ENUM_MAX); + return tickers_[tickerType]; +} + +void StatisticsImpl::setTickerCount(Tickers tickerType, uint64_t count) { + assert(tickerType < TICKER_ENUM_MAX); + tickers_[tickerType] = count; +} + +void StatisticsImpl::recordTick(Tickers tickerType, uint64_t count) { + assert(tickerType < TICKER_ENUM_MAX); + tickers_[tickerType] += count; +} + +void StatisticsImpl::measureTime(Histograms histogramType, uint64_t value) { + assert(histogramType < HISTOGRAM_ENUM_MAX); + histograms_[histogramType].Add(value); +} + +void StatisticsImpl::histogramData(Histograms histogramType, + HistogramData* const data) { + assert(histogramType < HISTOGRAM_ENUM_MAX); + histograms_[histogramType].Data(data); +} + namespace { + // a buffer size used for temp string buffers const int kBufferSize = 200; @@ -32,11 +68,8 @@ std::string HistogramToString ( return std::string(buffer); }; -std::string TickerToString ( - Statistics* dbstats, - const Tickers& ticker, - const std::string& name) { - +std::string TickerToString(Statistics* dbstats, const Tickers& ticker, + const std::string& name) { char buffer[kBufferSize]; snprintf(buffer, kBufferSize, "%s COUNT : %ld\n", name.c_str(), dbstats->getTickerCount(ticker)); diff --git a/util/statistics.h b/util/statistics.h new file mode 100644 index 000000000..36456dddc --- /dev/null +++ b/util/statistics.h @@ -0,0 +1,53 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#pragma once +#include "rocksdb/statistics.h" +#include "util/histogram.h" +#include "util/mutexlock.h" + +#define UNLIKELY(val) (__builtin_expect((val), 0)) + +namespace rocksdb { + +class StatisticsImpl : public Statistics { + public: + StatisticsImpl(); + virtual ~StatisticsImpl(); + + virtual long getTickerCount(Tickers tickerType); + virtual void setTickerCount(Tickers tickerType, uint64_t count); + virtual void recordTick(Tickers tickerType, uint64_t count); + virtual void measureTime(Histograms histogramType, uint64_t value); + virtual void histogramData(Histograms histogramType, + HistogramData* const data); + + private: + std::atomic_uint_fast64_t tickers_[TICKER_ENUM_MAX]; + HistogramImpl histograms_[HISTOGRAM_ENUM_MAX]; +}; + +// Utility functions +inline void MeasureTime(Statistics* statistics, Histograms histogramType, + uint64_t value) { + if (statistics) { + statistics->measureTime(histogramType, value); + } +} + +inline void RecordTick(Statistics* statistics, Tickers ticker, + uint64_t count = 1) { + if (statistics) { + statistics->recordTick(ticker, count); + } +} + +inline void SetTickerCount(Statistics* statistics, Tickers ticker, + uint64_t count) { + if (statistics) { + statistics->setTickerCount(ticker, count); + } +} +} diff --git a/util/statistics_imp.h b/util/statistics_imp.h deleted file mode 100644 index 0dc8884c1..000000000 --- a/util/statistics_imp.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. 
-// -#pragma once -#include "rocksdb/statistics.h" - -namespace rocksdb { - -// Utility functions -inline void RecordTick(Statistics* statistics, - Tickers ticker, - uint64_t count = 1) { - assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX); - assert(TickersNameMap.size() == TICKER_ENUM_MAX); - if (statistics) { - statistics->recordTick(ticker, count); - } -} - -inline void SetTickerCount(Statistics* statistics, - Tickers ticker, - uint64_t count) { - assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX); - assert(TickersNameMap.size() == TICKER_ENUM_MAX); - if (statistics) { - statistics->setTickerCount(ticker, count); - } -} - -} diff --git a/util/stop_watch.h b/util/stop_watch.h index 6325a7440..48e1b01c2 100644 --- a/util/stop_watch.h +++ b/util/stop_watch.h @@ -5,7 +5,7 @@ // #pragma once #include "rocksdb/env.h" -#include "util/statistics_imp.h" +#include "util/statistics.h" namespace rocksdb { // Auto-scoped. @@ -28,11 +28,7 @@ class StopWatch { return env_->NowMicros() - start_time_; } - ~StopWatch() { - if (statistics_) { - statistics_->measureTime(histogram_name_, ElapsedMicros()); - } - } + ~StopWatch() { MeasureTime(statistics_, histogram_name_, ElapsedMicros()); } private: Env* const env_; diff --git a/util/vectorrep.cc b/util/vectorrep.cc index 8d3ccc9df..87fae4bc7 100644 --- a/util/vectorrep.cc +++ b/util/vectorrep.cc @@ -88,7 +88,7 @@ class VectorRep : public MemTableRep { using MemTableRep::GetIterator; // Return an iterator over the keys in this representation. - virtual std::shared_ptr GetIterator() override; + virtual MemTableRep::Iterator* GetIterator() override; private: friend class Iterator; @@ -228,22 +228,22 @@ void VectorRep::Iterator::SeekToLast() { } } -std::shared_ptr VectorRep::GetIterator() { +MemTableRep::Iterator* VectorRep::GetIterator() { ReadLock l(&rwlock_); // Do not sort here. The sorting would be done the first time // a Seek is performed on the iterator. 
if (immutable_) { - return std::make_shared(this, bucket_, compare_); + return new Iterator(this, bucket_, compare_); } else { std::shared_ptr tmp; tmp.reset(new Bucket(*bucket_)); // make a copy - return std::make_shared(nullptr, tmp, compare_); + return new Iterator(nullptr, tmp, compare_); } } } // anon namespace -std::shared_ptr VectorRepFactory::CreateMemTableRep( - MemTableRep::KeyComparator& compare, Arena* arena) { - return std::make_shared(compare, arena, count_); +MemTableRep* VectorRepFactory::CreateMemTableRep( + MemTableRep::KeyComparator& compare, Arena* arena) { + return new VectorRep(compare, arena, count_); } } // namespace rocksdb