From c4548d5f1f46645d996c7640d591397769ccbaf4 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Mon, 13 Jan 2014 15:01:34 -0800 Subject: [PATCH 01/27] WriteBatch to provide a way for the user to query data size directly and only return a constant reference to the data in Data() Summary: WriteBatch::Data() is currently easy for users to misuse. Also, there is no cheap way for a user of WriteBatch to know the accumulated data size. This patch fixes the problem by: (1) returning a constant reference from Data() so it is obvious to the caller what it means; (2) adding a function to return the data size directly. Test Plan: make all check Reviewers: haobo, igor, kailiu Reviewed By: kailiu CC: zshao, leveldb Differential Revision: https://reviews.facebook.net/D15123 --- include/rocksdb/write_batch.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index 798807045..30abead50 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -88,7 +88,10 @@ class WriteBatch { Status Iterate(Handler* handler) const; // Retrieve the serialized version of this batch. - std::string Data() { return rep_; } + const std::string& Data() const { return rep_; } + + // Retrieve data size of the batch. + size_t GetDataSize() const { return rep_.size(); } // Returns the number of updates in the batch int Count() const; From ac2fe728327be75c8c289d4e3ebf8587d88c518d Mon Sep 17 00:00:00 2001 From: kailiu Date: Mon, 13 Jan 2014 22:09:41 -0800 Subject: [PATCH 02/27] Compile dynamic library by default Summary: Per request, some users need to use the dynamic rocksdb library instead of the static one. However, currently the dynamic libraries have to be compiled manually, which is inconvenient. I made the dynamic libraries compile by default. 
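For illustration, here is a minimal sketch of a program that consumes the shared library this change produces. The database path and the link line in the comment are assumptions that depend on the install; the code itself uses only the public DB::Open/Put API.

// Hypothetical consumer of librocksdb.so; compile with something like:
//   g++ -std=c++11 use_rocksdb.cc -lrocksdb -o use_rocksdb
#include <cassert>
#include "rocksdb/db.h"

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/rocksdb_shared_demo", &db);
  assert(s.ok());
  s = db->Put(rocksdb::WriteOptions(), "key", "value");
  assert(s.ok());
  delete db;  // closing the DB releases the lock on its directory
  return 0;
}
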
Test Plan: make clean; make; make clean; Reviewers: haobo, sdong, dhruba, igor Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D15117 --- Makefile | 6 +++--- build_tools/build_detect_platform | 4 ++-- build_tools/fbcode.gcc481.sh | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index ff8347957..5170ac54a 100644 --- a/Makefile +++ b/Makefile @@ -127,12 +127,12 @@ $(SHARED2): $(SHARED3) ln -fs $(SHARED3) $(SHARED2) endif -$(SHARED3): - $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(COVERAGEFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) -o $@ $(LDFLAGS) +$(SHARED3): $(LIBOBJECTS) + $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(LDFLAGS) $(SOURCES) -o $@ endif # PLATFORM_SHARED_EXT -all: $(LIBRARY) $(PROGRAMS) +all: $(LIBRARY) $(PROGRAMS) $(SHARED) .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ release tags valgrind_check whitebox_crash_test diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 87c4c871d..8e83ae497 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -81,9 +81,9 @@ PLATFORM_CCFLAGS= PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}" PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS" PLATFORM_SHARED_EXT="so" -PLATFORM_SHARED_LDFLAGS="${EXEC_LDFLAGS_SHARED} -shared -Wl,-soname -Wl," +PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl," PLATFORM_SHARED_CFLAGS="-fPIC" -PLATFORM_SHARED_VERSIONED=true +PLATFORM_SHARED_VERSIONED=false # generic port files (working on all platform by #ifdef) go directly in /port GENERIC_PORT_FILES=`find $ROCKSDB_ROOT/port -name '*.cc' | tr "\n" " "` diff --git a/build_tools/fbcode.gcc481.sh b/build_tools/fbcode.gcc481.sh index ae2bb57da..e8c9f090b 100644 --- a/build_tools/fbcode.gcc481.sh +++ b/build_tools/fbcode.gcc481.sh @@ -60,7 +60,7 @@ AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic" -CFLAGS+=" -nostdlib $LIBGCC_INCLUDE $GLIBC_INCLUDE" +CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE" CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT" CFLAGS+=" -DSNAPPY -DGFLAGS -DZLIB -DBZIP2" From 51dd21926c677ae4a63c8f45992903e7b30f0d13 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Tue, 14 Jan 2014 10:42:36 -0800 Subject: [PATCH 03/27] DB::Put() to estimate write batch data size needed and pre-allocate buffer Summary: In one of the CPU profiles, we see some CPU costs of string::reserve() inside Batch.Put(). This patch should be able to reduce some of the costs by allocating a sufficient buffer beforehand. Since it is a trivial percentage of CPU costs, I didn't find a way to show the improvement in one of the benchmarks. I'll deploy it to the same application and do the same CPU profiling to make sure those CPU costs are reduced. 
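To make the sizing arithmetic concrete, here is a small sketch of the estimate the patch hard-codes in DB::Put() below; the helper name is hypothetical, and the constants mirror the comment in the diff (a 12-byte batch header, a 1-byte record type, and generous room for the two varint32 lengths).

// Hypothetical helper mirroring the "key.size() + value.size() + 24" estimate.
#include <cstddef>

size_t EstimateSinglePutBatchSize(size_t key_size, size_t value_size) {
  const size_t kBatchHeader = 8 + 4;  // 8-byte sequence number + 4-byte count
  const size_t kTypeByte = 1;         // record type tag
  const size_t kVarintSlack = 11;     // room for the two varint32 lengths
  return key_size + value_size + kBatchHeader + kTypeByte + kVarintSlack;
}
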
Test Plan: make all check Reviewers: haobo, kailiu, igor Reviewed By: haobo CC: leveldb, nkg- Differential Revision: https://reviews.facebook.net/D15135 --- db/db_impl.cc | 5 ++++- db/write_batch.cc | 3 ++- include/rocksdb/write_batch.h | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index e7f2abf99..12e07868f 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3758,7 +3758,10 @@ Status DBImpl::GetDbIdentity(std::string& identity) { // Default implementations of convenience methods that subclasses of DB // can call if they wish Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) { - WriteBatch batch; + // Pre-allocate size of write batch conservatively. + // 8 bytes are taken by header, 4 bytes for count, 1 byte for type, + // and we allocate 11 extra bytes for key length, as well as value length. + WriteBatch batch(key.size() + value.size() + 24); batch.Put(key, value); return Write(opt, &batch); } diff --git a/db/write_batch.cc b/db/write_batch.cc index 2cfc8bd7d..7a6106afa 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -36,7 +36,8 @@ namespace rocksdb { // WriteBatch header has an 8-byte sequence number followed by a 4-byte count. static const size_t kHeader = 12; -WriteBatch::WriteBatch() { +WriteBatch::WriteBatch(size_t reserved_bytes) { + rep_.reserve((reserved_bytes > kHeader) ? reserved_bytes : kHeader); Clear(); } diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index 30abead50..e7ce16005 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -35,7 +35,7 @@ struct SliceParts; class WriteBatch { public: - WriteBatch(); + explicit WriteBatch(size_t reserved_bytes = 0); ~WriteBatch(); // Store the mapping "key->value" in the database. From fbbf0d1456f8d872d100f7cbfceb9f9b89249664 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Tue, 14 Jan 2014 11:04:27 -0800 Subject: [PATCH 04/27] Pre-calculate whether to slow down for too many level 0 files Summary: Currently in DBImpl::MakeRoomForWrite(), we do "versions_->NumLevelFiles(0) >= options_.level0_slowdown_writes_trigger" to check whether the writer thread needs to slow down. However, versions_->NumLevelFiles(0) is slightly more expensive than we expected. By caching the result of the comparison when installing a new version, we can avoid this function call every time. Test Plan: make all check Manually trigger this behavior by applying the universal compaction style and make sure inserts are slowed down after there are a certain number of files. Reviewers: haobo, kailiu, igor Reviewed By: kailiu CC: nkg-, leveldb Differential Revision: https://reviews.facebook.net/D15141 --- db/db_impl.cc | 3 +-- db/version_set.cc | 4 ++++ db/version_set.h | 8 ++++++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 12e07868f..ed5853336 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3166,8 +3166,7 @@ Status DBImpl::MakeRoomForWrite(bool force, break; } else if ( allow_delay && - versions_->NumLevelFiles(0) >= - options_.level0_slowdown_writes_trigger) { + versions_->NeedSlowdownForNumLevel0Files()) { // We are getting close to hitting a hard limit on the number of // L0 files. 
Rather than delaying a single write by several // seconds when we hit the hard limit, start delaying each diff --git a/db/version_set.cc b/db/version_set.cc index 46cdfaa61..7a1f5cbf8 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1148,6 +1148,7 @@ VersionSet::VersionSet(const std::string& dbname, num_levels_(options_->num_levels), dummy_versions_(this), current_(nullptr), + need_slowdown_for_num_level0_files(false), compactions_in_progress_(options_->num_levels), current_version_number_(0), last_observed_manifest_size_(0), @@ -1199,6 +1200,9 @@ void VersionSet::AppendVersion(Version* v) { current_->Unref(); } current_ = v; + need_slowdown_for_num_level0_files = + (options_->level0_slowdown_writes_trigger >= 0 && current_ != nullptr && + NumLevelFiles(0) >= options_->level0_slowdown_writes_trigger); v->Ref(); // Append to linked list diff --git a/db/version_set.h b/db/version_set.h index 75b529942..85ff2ff36 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -250,6 +250,12 @@ class VersionSet { // Return the current version. Version* current() const { return current_; } + // A flag indicating whether writes need to slow down because there are + // too many level-0 files. + bool NeedSlowdownForNumLevel0Files() const { + return need_slowdown_for_num_level0_files; + } + // Return the current manifest file number uint64_t ManifestFileNumber() const { return manifest_file_number_; } @@ -489,6 +495,8 @@ class VersionSet { Version dummy_versions_; // Head of circular doubly-linked list of versions. Version* current_; // == dummy_versions_.prev_ + bool need_slowdown_for_num_level0_files; + // Per-level key at which the next compaction at that level should start. // Either an empty string, or a valid InternalKey. std::string* compact_pointer_; From 1d9bac4d7f2e66f056f0ac21753f5c0e7379e1bf Mon Sep 17 00:00:00 2001 From: Naman Gupta Date: Fri, 15 Nov 2013 17:17:13 -0800 Subject: [PATCH 05/27] Use sanitized options while opening db Summary: We use SanitizeOptions() to set appropriate values for some options, based on other options. So we should use the sanitized options by default. Luckily it hasn't caused a bug yet, but it can result in a bug in the future. 
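The pattern at issue deserves a concrete illustration; the following standalone sketch uses stand-in types and defaults, not RocksDB's real Options/SanitizeOptions signatures.

// Simplified illustration of the sanitize-then-use pattern enforced here.
struct DemoOptions {
  int max_open_files = 0;  // 0 means "let the library pick a default"
};

DemoOptions SanitizeDemoOptions(const DemoOptions& src) {
  DemoOptions result = src;
  if (result.max_open_files <= 0) {
    result.max_open_files = 1000;  // fill in a usable default
  }
  return result;
}

void OpenDemo(const DemoOptions& user_options) {
  const DemoOptions options = SanitizeDemoOptions(user_options);
  // Correct: read 'options' from here on. Reading 'user_options' instead
  // would silently bypass the defaults filled in above -- exactly the
  // class of bug this patch removes from DB::Open().
  (void)options;
}
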
Test Plan: make check Reviewers: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D14103 --- db/db_impl.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index ed5853336..b50eb4c44 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3808,13 +3808,13 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { uint64_t new_log_number = impl->versions_->NewFileNumber(); unique_ptr<WritableFile> lfile; soptions.use_mmap_writes = false; - s = options.env->NewWritableFile( + s = impl->options_.env->NewWritableFile( LogFileName(impl->options_.wal_dir, new_log_number), &lfile, soptions ); if (s.ok()) { - lfile->SetPreallocationBlockSize(1.1 * options.write_buffer_size); + lfile->SetPreallocationBlockSize(1.1 * impl->options_.write_buffer_size); edit.SetLogNumber(new_log_number); impl->logfile_number_ = new_log_number; impl->log_.reset(new log::Writer(std::move(lfile))); @@ -3830,7 +3830,7 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { } impl->mutex_.Unlock(); - if (options.compaction_style == kCompactionStyleUniversal) { + if (impl->options_.compaction_style == kCompactionStyleUniversal) { int num_files; for (int i = 1; i < impl->NumberLevels(); i++) { num_files = impl->versions_->NumLevelFiles(i); From d702d8073e2572a19c806fa53e484a25863f6df4 Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Tue, 14 Jan 2014 00:39:42 -0800 Subject: [PATCH 06/27] A script that automatically reformats affected lines Summary: Added a script that reformats only the affected lines in a given diff. I planned to make that file a pre-commit hook, but it looks a little more difficult than I thought. Since I don't want to spend too much time on this task right now, I eventually added a "make" command to achieve this with a few additional keystrokes. Also made the clang-format rules solely inherited from Google's style -- there are still debates on some of the style issues, but we can address them later once we reach a consensus. Test Plan: Did some ugly format change and ran "make format", all affected lines are formatted as expected. 
Reviewers: igor, sdong, haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D15147 --- .clang-format | 42 ------------------- Makefile | 11 ++++- build_tools/format-diff.sh | 83 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 43 deletions(-) create mode 100755 build_tools/format-diff.sh diff --git a/.clang-format b/.clang-format index a1e9a48e4..7c279811a 100644 --- a/.clang-format +++ b/.clang-format @@ -2,46 +2,4 @@ # http://clang.llvm.org/docs/ClangFormatStyleOptions.html --- BasedOnStyle: Google -AccessModifierOffset: -1 -ConstructorInitializerIndentWidth: 4 -AlignEscapedNewlinesLeft: true -AlignTrailingComments: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortIfStatementsOnASingleLine: false -AllowShortLoopsOnASingleLine: false -AlwaysBreakTemplateDeclarations: true -AlwaysBreakBeforeMultilineStrings: true -BreakBeforeBinaryOperators: false -BreakConstructorInitializersBeforeComma: false -BinPackParameters: false -ColumnLimit: 80 -ConstructorInitializerAllOnOneLineOrOnePerLine: true -DerivePointerBinding: true -ExperimentalAutoDetectBinPacking: true -IndentCaseLabels: false -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCSpaceBeforeProtocolList: false -PenaltyBreakBeforeFirstCallParameter: 10 -PenaltyBreakComment: 60 -PenaltyBreakString: 1000 -PenaltyBreakFirstLessLess: 20 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 200 -PointerBindsToType: true -SpacesBeforeTrailingComments: 2 -Cpp11BracedListStyle: true -Standard: Cpp11 -IndentWidth: 2 -TabWidth: 8 -UseTab: Never -BreakBeforeBraces: Attach -IndentFunctionDeclarationAfterType: false -SpacesInParentheses: false -SpacesInAngles: false -SpaceInEmptyParentheses: false -SpacesInCStyleCastParentheses: false -SpaceAfterControlStatementKeyword: true -SpaceBeforeAssignmentOperators: true -ContinuationIndentWidth: 4 ... diff --git a/Makefile b/Makefile index 5170ac54a..ebf7b96fe 100644 --- a/Makefile +++ b/Makefile @@ -135,7 +135,7 @@ endif # PLATFORM_SHARED_EXT all: $(LIBRARY) $(PROGRAMS) $(SHARED) .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ - release tags valgrind_check whitebox_crash_test + release tags valgrind_check whitebox_crash_test format release: $(MAKE) clean @@ -196,6 +196,9 @@ tags: ctags * -R cscope -b `find . -name '*.cc'` `find . -name '*.h'` +format: + build_tools/format-diff.sh + # --------------------------------------------------------------------------- # Unit tests and tools # --------------------------------------------------------------------------- @@ -411,6 +414,12 @@ DEPFILES = $(filter-out util/build_version.d,$(SOURCES:.cc=.d)) depend: $(DEPFILES) +# if the make goal is either "clean" or "format", we shouldn't +# try to import the *.d files. +# TODO(kailiu) Unfamiliarity with Make's conditionals leads to this ugly +# but working solution. ifneq ($(MAKECMDGOALS),clean) +ifneq ($(MAKECMDGOALS),format) -include $(DEPFILES) endif +endif diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh new file mode 100755 index 000000000..758135c9f --- /dev/null +++ b/build_tools/format-diff.sh @@ -0,0 +1,83 @@ +#!/bin/bash +set -e +# If the clang_format_diff.py command is not specified, we assume we can +# access it directly without any path. +if [ -z $CLANG_FORMAT_DIFF ] +then +CLANG_FORMAT_DIFF="clang-format-diff.py" +fi + +# Check clang-format-diff.py +if ! which $CLANG_FORMAT_DIFF &> /dev/null +then + echo "You don't have clang-format-diff.py available on your computer!" 
+ echo "You can download it by running: " + echo " curl https://fburl.com/clang-format-diff" + exit 128 +fi + +# Check argparse, a library that clang-format-diff.py requires. +python 2>/dev/null << EOF +import argparse +EOF + +if [ "$?" != 0 ] +then + echo "To run clang-format-diff.py, we'll need the library "argparse" to be" + echo "installed. You can try one of the following ways to install it:" + echo " 1. Manually download argparse: https://pypi.python.org/pypi/argparse" + echo " 2. easy_install argparse (if you have easy_install)" + echo " 3. pip install argparse (if you have pip)" + exit 129 +fi + +# TODO(kailiu) the following work is not complete since we still need to figure +# out how to add the files modified by the pre-commit hook to git's commit index. +# +# Check if this script has already been added to the pre-commit hook. +# Will suggest the user add this script to the pre-commit hook if their pre-commit +# is empty. +# PRE_COMMIT_SCRIPT_PATH="`git rev-parse --show-toplevel`/.git/hooks/pre-commit" +# if ! ls $PRE_COMMIT_SCRIPT_PATH &> /dev/null +# then +# echo "Would you like to add this script to pre-commit hook, which will do " +# echo -n "the format check for all the affected lines before you check in (y/n):" +# read add_to_hook +# if [ "$add_to_hook" == "y" ] +# then +# ln -s `git rev-parse --show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH +# fi +# fi + +# Check the format of recently changed lines. +diffs=$(git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -p 1) + +if [ -z "$diffs" ] +then + echo "Nothing needs to be reformatted!" + exit 0 +fi + +# Highlight the insertion/deletion from the clang-format-diff.py's output +COLOR_END="\033[0m" +COLOR_RED="\033[0;31m" +COLOR_GREEN="\033[0;32m" + +echo -e "Detected lines that don't follow the format rules:\r" +# Add the color to the diff. Lines added will be green; lines removed will be red. +echo "$diffs" | + sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" | + sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/" +echo -e "Would you like to fix the format automatically (y/n): \c" + +# Make sure under any mode, we can read user input. +exec < /dev/tty +read to_fix + +if [ "$to_fix" != "y" ] +then + exit 1 +fi + +# Do in-place format adjustment. +git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -i -p 1 From 481c77e526e59accf98ef9a5527ab7fb0e40104b Mon Sep 17 00:00:00 2001 From: kailiu Date: Tue, 14 Jan 2014 13:54:33 -0800 Subject: [PATCH 07/27] Move the compilation of the shared libraries to "make release" Compiling the shared libraries takes a long time. Thus, to speed up development, it still makes sense to keep it separate from the regular compilation. --- Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index ebf7b96fe..572e42e9e 100644 --- a/Makefile +++ b/Makefile @@ -132,14 +132,16 @@ $(SHARED3): $(LIBOBJECTS) endif # PLATFORM_SHARED_EXT -all: $(LIBRARY) $(PROGRAMS) $(SHARED) +all: $(LIBRARY) $(PROGRAMS) .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ release tags valgrind_check whitebox_crash_test format +# Will also generate shared libraries. 
release: $(MAKE) clean - OPT=-DNDEBUG $(MAKE) -j32 + OPT=-DNDEBUG $(MAKE) all -j32 + OPT=-DNDEBUG $(MAKE) $(SHARED) -j32 coverage: $(MAKE) clean From 7d9f21cf23d5951fe7654972ca99e0a17cffc177 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 14 Jan 2014 14:49:31 -0800 Subject: [PATCH 08/27] BuildBatchGroup -- memcpy outside of lock Summary: When building a batch group, don't actually build a new batch, since it requires a heavy-weight mem copy and malloc. Only store references to the batches and build the batch group without the lock held. Test Plan: `make check` I am also planning to run performance tests. The workload that will benefit from this change is readwhilewriting. I will post the results once I have them. Reviewers: dhruba, haobo, kailiu Reviewed By: haobo CC: leveldb, xjin Differential Revision: https://reviews.facebook.net/D15063 --- db/db_impl.cc | 38 +++++++++++++++---------------- db/db_impl.h | 4 +++- db/db_test.cc | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 20 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index b50eb4c44..37e8d7582 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -56,6 +56,7 @@ #include "util/mutexlock.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" +#include "util/autovector.h" namespace rocksdb { @@ -2969,12 +2970,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { uint64_t last_sequence = versions_->LastSequence(); Writer* last_writer = &w; if (status.ok() && my_batch != nullptr) { // nullptr batch is for compactions - // TODO: BuildBatchGroup physically concatenate/copy all write batches into - // a new one. Mem copy is done with the lock held. Ideally, we only need - // the lock to obtain the last_writer and the references to all batches. - // Creation (copy) of the merged batch could have been done outside of the - // lock protected region. - WriteBatch* updates = BuildBatchGroup(&last_writer); + autovector<WriteBatch*> write_batch_group; + BuildBatchGroup(&last_writer, &write_batch_group); // Add to log and apply to memtable. We can release the lock // during this phase since &w is currently responsible for logging // and protects against concurrent loggers and concurrent writes // into mem_. 
{ mutex_.Unlock(); + WriteBatch* updates = nullptr; + if (write_batch_group.size() == 1) { + updates = write_batch_group[0]; + } else { + updates = &tmp_batch_; + for (size_t i = 0; i < write_batch_group.size(); ++i) { + WriteBatchInternal::Append(updates, write_batch_group[i]); + } + } + const SequenceNumber current_sequence = last_sequence + 1; WriteBatchInternal::SetSequence(updates, current_sequence); int my_batch_count = WriteBatchInternal::Count(updates); @@ -3027,12 +3034,12 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER, last_sequence); } + if (updates == &tmp_batch_) tmp_batch_.Clear(); mutex_.Lock(); if (status.ok()) { versions_->SetLastSequence(last_sequence); } } - if (updates == &tmp_batch_) tmp_batch_.Clear(); } if (options_.paranoid_checks && !status.ok() && bg_error_.ok()) { bg_error_ = status; // stop compaction & fail any further writes @@ -3060,13 +3067,14 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // REQUIRES: Writer list must be non-empty // REQUIRES: First writer must have a non-nullptr batch -WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) { +void DBImpl::BuildBatchGroup(Writer** last_writer, + autovector<WriteBatch*>* write_batch_group) { assert(!writers_.empty()); Writer* first = writers_.front(); - WriteBatch* result = first->batch; - assert(result != nullptr); + assert(first->batch != nullptr); size_t size = WriteBatchInternal::ByteSize(first->batch); + write_batch_group->push_back(first->batch); // Allow the group to grow up to a maximum size, but if the // original write is small, limit the growth so we do not slow @@ -3099,18 +3107,10 @@ WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) { break; } - // Append to *reuslt - if (result == first->batch) { - // Switch to temporary batch instead of disturbing caller's batch - result = &tmp_batch_; - assert(WriteBatchInternal::Count(result) == 0); - WriteBatchInternal::Append(result, first->batch); - } - WriteBatchInternal::Append(result, w->batch); + write_batch_group->push_back(w->batch); } *last_writer = w; } - return result; } // This function computes the amount of time in microseconds by which a write diff --git a/db/db_impl.h b/db/db_impl.h index d33efd19e..d74b77aa4 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -22,6 +22,7 @@ #include "port/port.h" #include "util/stats_logger.h" #include "memtablelist.h" +#include "util/autovector.h" namespace rocksdb { @@ -291,7 +292,8 @@ class DBImpl : public DB { // the superversion outside of mutex Status MakeRoomForWrite(bool force /* compact even if there is room? */, SuperVersion** superversion_to_free); - WriteBatch* BuildBatchGroup(Writer** last_writer); + void BuildBatchGroup(Writer** last_writer, + autovector<WriteBatch*>* write_batch_group); // Force current memtable contents to be flushed. 
Status FlushMemTable(const FlushOptions& options); diff --git a/db/db_test.cc b/db/db_test.cc index a0b3d9aaa..560311ae3 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -4333,6 +4333,69 @@ TEST(DBTest, MultiThreaded) { } while (ChangeOptions()); } +// Group commit test: +namespace { + +static const int kGCNumThreads = 4; +static const int kGCNumKeys = 1000; + +struct GCThread { + DB* db; + int id; + std::atomic<bool> done; +}; + +static void GCThreadBody(void* arg) { + GCThread* t = reinterpret_cast<GCThread*>(arg); + int id = t->id; + DB* db = t->db; + WriteOptions wo; + + for (int i = 0; i < kGCNumKeys; ++i) { + std::string kv(std::to_string(i + id * kGCNumKeys)); + ASSERT_OK(db->Put(wo, kv, kv)); + } + t->done = true; +} + +} // namespace + +TEST(DBTest, GroupCommitTest) { + do { + // Start threads + GCThread thread[kGCNumThreads]; + for (int id = 0; id < kGCNumThreads; id++) { + thread[id].id = id; + thread[id].db = db_; + thread[id].done = false; + env_->StartThread(GCThreadBody, &thread[id]); + } + + for (int id = 0; id < kGCNumThreads; id++) { + while (thread[id].done == false) { + env_->SleepForMicroseconds(100000); + } + } + + std::vector<std::string> expected_db; + for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) { + expected_db.push_back(std::to_string(i)); + } + sort(expected_db.begin(), expected_db.end()); + + Iterator* itr = db_->NewIterator(ReadOptions()); + itr->SeekToFirst(); + for (auto x : expected_db) { + ASSERT_TRUE(itr->Valid()); + ASSERT_EQ(itr->key().ToString(), x); + ASSERT_EQ(itr->value().ToString(), x); + itr->Next(); + } + ASSERT_TRUE(!itr->Valid()); + + } while (ChangeOptions()); +} + namespace { typedef std::map<std::string, std::string> KVMap; } From 055e6df45b24204feb34461754a482ef7ffc14b6 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 14 Jan 2014 15:27:09 -0800 Subject: [PATCH 09/27] VersionEdit not to take NumLevels() Summary: I will submit a sequence of diffs that are preparing the master branch for column families. There are a lot of implicit assumptions in the code that make column family implementation hard. If I make the change only in the column family branch, it will make merging back to master impossible. Most of the diffs will be simple code refactorings, so I hope we can have fast turnaround time. Feel free to grab me in person to discuss any of them. This diff removes the number-of-levels check from VersionEdit. The number of levels is used only when a VersionEdit is read, not written, but it had to be set when one was written. I believe it is the right thing to make VersionEdit dumb and check consistency on the caller side. This will also make it much easier to implement Column Families, since different column families can have different numbers of levels. 
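A tiny sketch of the resulting division of labor may help; the types below are simplified stand-ins, not the real VersionEdit/VersionSet classes.

// The edit only records the maximum level it touched while decoding...
struct EditSketch {
  int max_level = 0;  // highest level seen while decoding this edit
};

// ...and the caller (recovery) checks it against its own num_levels,
// roughly what VersionSet::Recover() does in the diff below.
bool ValidateEdit(const EditSketch& edit, int num_levels, const char** errmsg) {
  if (edit.max_level >= num_levels) {
    *errmsg = "db has more levels than options.num_levels";
    return false;
  }
  return true;
}
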
Test Plan: make check Reviewers: dhruba, haobo, sdong, kailiu Reviewed By: kailiu CC: leveldb Differential Revision: https://reviews.facebook.net/D15159 --- db/db_impl.cc | 30 ++++++++---------- db/db_impl_readonly.cc | 2 +- db/db_test.cc | 9 +++--- db/memtable.cc | 16 ++++------ db/memtable.h | 7 ++--- db/repair.cc | 5 ++- db/version_edit.cc | 10 +++--- db/version_edit.h | 11 +++---- db/version_edit_test.cc | 4 +-- db/version_set.cc | 47 ++++++++++++++++------------- db/version_set_reduce_num_levels.cc | 4 +-- 11 files changed, 66 insertions(+), 79 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 37e8d7582..4781ad85d 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -252,8 +252,8 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) : env_(options.env), dbname_(dbname), internal_comparator_(options.comparator), - options_(SanitizeOptions( - dbname, &internal_comparator_, &internal_filter_policy_, options)), + options_(SanitizeOptions(dbname, &internal_comparator_, + &internal_filter_policy_, options)), internal_filter_policy_(options.filter_policy), owns_info_log_(options_.info_log != options.info_log), db_lock_(nullptr), @@ -261,8 +261,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) shutting_down_(nullptr), bg_cv_(&mutex_), mem_rep_factory_(options_.memtable_factory.get()), - mem_(new MemTable(internal_comparator_, mem_rep_factory_, - NumberLevels(), options_)), + mem_(new MemTable(internal_comparator_, options_)), logfile_number_(0), super_version_(nullptr), tmp_batch_(), @@ -408,7 +407,7 @@ uint64_t DBImpl::TEST_Current_Manifest_FileNo() { } Status DBImpl::NewDB() { - VersionEdit new_db(NumberLevels()); + VersionEdit new_db; new_db.SetComparatorName(user_comparator()->Name()); new_db.SetLogNumber(0); new_db.SetNextFile(2); @@ -864,7 +863,7 @@ void DBImpl::PurgeObsoleteWALFiles() { // If externalTable is set, then apply recovered transactions // to that table. This is used for readonly mode. Status DBImpl::Recover(VersionEdit* edit, MemTable* external_table, - bool error_if_log_file_exist) { + bool error_if_log_file_exist) { mutex_.AssertHeld(); assert(db_lock_ == nullptr); @@ -1031,8 +1030,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, WriteBatchInternal::SetContents(&batch, record); if (mem == nullptr) { - mem = new MemTable(internal_comparator_, mem_rep_factory_, - NumberLevels(), options_); + mem = new MemTable(internal_comparator_, options_); mem->Ref(); } status = WriteBatchInternal::InsertInto(&batch, mem, &options_); @@ -1358,7 +1356,7 @@ void DBImpl::ReFitLevel(int level, int target_level) { Log(options_.info_log, "Before refitting:\n%s", versions_->current()->DebugString().data()); - VersionEdit edit(NumberLevels()); + VersionEdit edit; for (const auto& f : versions_->current()->files_[level]) { edit.DeleteFile(level, f->number); edit.AddFile(to_level, f->number, f->file_size, f->smallest, f->largest, @@ -3289,17 +3287,13 @@ Status DBImpl::MakeRoomForWrite(bool force, EnvOptions soptions(storage_options_); soptions.use_mmap_writes = false; DelayLoggingAndReset(); - s = env_->NewWritableFile( - LogFileName(options_.wal_dir, new_log_number), - &lfile, - soptions - ); + s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number), + &lfile, soptions); if (s.ok()) { // Our final size should be less than write_buffer_size // (compression, etc) but err on the side of caution. 
lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size); - memtmp = new MemTable( - internal_comparator_, mem_rep_factory_, NumberLevels(), options_); + memtmp = new MemTable(internal_comparator_, options_); new_superversion = new SuperVersion(options_.max_write_buffer_number); } } @@ -3680,7 +3674,7 @@ Status DBImpl::DeleteFile(std::string name) { int level; FileMetaData metadata; int maxlevel = NumberLevels(); - VersionEdit edit(maxlevel); + VersionEdit edit; DeletionState deletion_state(0, true); { MutexLock l(&mutex_); @@ -3802,7 +3796,7 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { return s; } impl->mutex_.Lock(); - VersionEdit edit(impl->NumberLevels()); + VersionEdit edit; s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists if (s.ok()) { uint64_t new_log_number = impl->versions_->NewFileNumber(); diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index dbb297e93..04033b2fa 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -86,7 +86,7 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, DBImplReadOnly* impl = new DBImplReadOnly(options, dbname); impl->mutex_.Lock(); - VersionEdit edit(impl->NumberLevels()); + VersionEdit edit; Status s = impl->Recover(&edit, impl->GetMemTable(), error_if_log_file_exist); impl->mutex_.Unlock(); diff --git a/db/db_test.cc b/db/db_test.cc index 560311ae3..2ff47320a 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -765,10 +765,9 @@ TEST(DBTest, LevelLimitReopen) { options.num_levels = 1; options.max_bytes_for_level_multiplier_additional.resize(1, 1); Status s = TryReopen(&options); - ASSERT_EQ(s.IsCorruption(), true); + ASSERT_EQ(s.IsInvalidArgument(), true); ASSERT_EQ(s.ToString(), - "Corruption: VersionEdit: db already has " - "more levels than options.num_levels"); + "Invalid argument: db has more levels than options.num_levels"); options.num_levels = 10; options.max_bytes_for_level_multiplier_additional.resize(10, 1); @@ -4936,7 +4935,7 @@ void BM_LogAndApply(int iters, int num_base_files) { EnvOptions sopt; VersionSet vset(dbname, &options, sopt, nullptr, &cmp); ASSERT_OK(vset.Recover()); - VersionEdit vbase(vset.NumberLevels()); + VersionEdit vbase; uint64_t fnum = 1; for (int i = 0; i < num_base_files; i++) { InternalKey start(MakeKey(2*fnum), 1, kTypeValue); @@ -4948,7 +4947,7 @@ void BM_LogAndApply(int iters, int num_base_files) { uint64_t start_micros = env->NowMicros(); for (int i = 0; i < iters; i++) { - VersionEdit vedit(vset.NumberLevels()); + VersionEdit vedit; vedit.DeleteFile(2, fnum); InternalKey start(MakeKey(2*fnum), 1, kTypeValue); InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); diff --git a/db/memtable.cc b/db/memtable.cc index 7881ce5bd..baff4fb34 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -33,24 +33,20 @@ struct hash { namespace rocksdb { -MemTable::MemTable(const InternalKeyComparator& cmp, - MemTableRepFactory* table_factory, - int numlevel, - const Options& options) +MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options) : comparator_(cmp), refs_(0), arena_impl_(options.arena_block_size), - table_(table_factory->CreateMemTableRep(comparator_, &arena_impl_)), + table_(options.memtable_factory->CreateMemTableRep(comparator_, + &arena_impl_)), flush_in_progress_(false), flush_completed_(false), file_number_(0), - edit_(numlevel), first_seqno_(0), mem_next_logfile_number_(0), mem_logfile_number_(0), - locks_(options.inplace_update_support - ? 
options.inplace_update_num_locks - : 0) { } + locks_(options.inplace_update_support ? options.inplace_update_num_locks + : 0) {} MemTable::~MemTable() { assert(refs_ == 0); @@ -58,7 +54,7 @@ MemTable::~MemTable() { size_t MemTable::ApproximateMemoryUsage() { return arena_impl_.ApproximateMemoryUsage() + - table_->ApproximateMemoryUsage(); + table_->ApproximateMemoryUsage(); } int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr) diff --git a/db/memtable.h b/db/memtable.h index 12ccf3d37..24a2c852b 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -34,11 +34,8 @@ class MemTable { // MemTables are reference counted. The initial reference count // is zero and the caller must call Ref() at least once. - explicit MemTable( - const InternalKeyComparator& comparator, - MemTableRepFactory* table_factory, - int numlevel = 7, - const Options& options = Options()); + explicit MemTable(const InternalKeyComparator& comparator, + const Options& options = Options()); ~MemTable(); diff --git a/db/repair.cc b/db/repair.cc index 6db90c865..29524233f 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -58,7 +58,7 @@ class Repairer { next_file_number_(1) { // TableCache can be small since we expect each table to be opened once. table_cache_ = new TableCache(dbname_, &options_, storage_options_, 10); - edit_ = new VersionEdit(options.num_levels); + edit_ = new VersionEdit(); } ~Repairer() { @@ -196,8 +196,7 @@ class Repairer { std::string scratch; Slice record; WriteBatch batch; - MemTable* mem = new MemTable(icmp_, options_.memtable_factory.get(), - options_.num_levels); + MemTable* mem = new MemTable(icmp_, options_); mem->Ref(); int counter = 0; while (reader.ReadRecord(&record, &scratch)) { diff --git a/db/version_edit.cc b/db/version_edit.cc index 9f23faba7..42c07e7b0 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -33,6 +33,7 @@ enum Tag { void VersionEdit::Clear() { comparator_.clear(); + max_level_ = 0; log_number_ = 0; prev_log_number_ = 0; last_sequence_ = 0; @@ -107,14 +108,13 @@ static bool GetInternalKey(Slice* input, InternalKey* dst) { bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) { uint32_t v; - if (GetVarint32(input, &v) && - (int)v < number_levels_) { + if (GetVarint32(input, &v)) { *level = v; + if (max_level_ < *level) { + max_level_ = *level; + } return true; } else { - if ((int)v >= number_levels_) { - *msg = "db already has more levels than options.num_levels"; - } return false; } } diff --git a/db/version_edit.h b/db/version_edit.h index 196914e2b..a0546c983 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -34,10 +34,7 @@ struct FileMetaData { class VersionEdit { public: - explicit VersionEdit(int number_levels) : - number_levels_(number_levels) { - Clear(); - } + VersionEdit() { Clear(); } ~VersionEdit() { } void Clear(); @@ -108,7 +105,7 @@ class VersionEdit { bool GetLevel(Slice* input, int* level, const char** msg); - int number_levels_; + int max_level_; std::string comparator_; uint64_t log_number_; uint64_t prev_log_number_; @@ -120,9 +117,9 @@ class VersionEdit { bool has_next_file_number_; bool has_last_sequence_; - std::vector< std::pair<int, InternalKey> > compact_pointers_; + std::vector<std::pair<int, InternalKey>> compact_pointers_; DeletedFileSet deleted_files_; - std::vector< std::pair<int, FileMetaData> > new_files_; + std::vector<std::pair<int, FileMetaData>> new_files_; }; } // namespace rocksdb diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 4a00822f7..745ea90d0 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -15,7 +15,7 @@ namespace rocksdb { static 
void TestEncodeDecode(const VersionEdit& edit) { std::string encoded, encoded2; edit.EncodeTo(&encoded); - VersionEdit parsed(7); + VersionEdit parsed(); Status s = parsed.DecodeFrom(encoded); ASSERT_TRUE(s.ok()) << s.ToString(); parsed.EncodeTo(&encoded2); @@ -27,7 +27,7 @@ class VersionEditTest { }; TEST(VersionEditTest, EncodeDecode) { static const uint64_t kBig = 1ull << 50; - VersionEdit edit(7); + VersionEdit edit(); for (int i = 0; i < 4; i++) { TestEncodeDecode(edit); edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, diff --git a/db/version_set.cc b/db/version_set.cc index 7a1f5cbf8..91b3dcd3f 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -980,14 +980,12 @@ class VersionSet::Builder { #endif } - void CheckConsistencyForDeletes( - VersionEdit* edit, - unsigned int number, - int level) { + void CheckConsistencyForDeletes(VersionEdit* edit, unsigned int number, + int level) { #ifndef NDEBUG // a file to be deleted better exist in the previous version bool found = false; - for (int l = 0; !found && l < edit->number_levels_; l++) { + for (int l = 0; !found && l < vset_->NumberLevels(); l++) { const std::vector<FileMetaData*>& base_files = base_->files_[l]; for (unsigned int i = 0; i < base_files.size(); i++) { FileMetaData* f = base_files[i]; @@ -1000,7 +998,7 @@ class VersionSet::Builder { // if the file did not exist in the previous version, then it // is possibly moved from lower level to higher level in current // version - for (int l = level+1; !found && l < edit->number_levels_; l++) { + for (int l = level+1; !found && l < vset_->NumberLevels(); l++) { const FileSet* added = levels_[l].added_files; for (FileSet::const_iterator added_iter = added->begin(); added_iter != added->end(); ++added_iter) { @@ -1213,7 +1211,7 @@ void VersionSet::AppendVersion(Version* v) { } Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, - bool new_descriptor_log) { + bool new_descriptor_log) { mu->AssertHeld(); // queue our request @@ -1383,7 +1381,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, } void VersionSet::LogAndApplyHelper(Builder* builder, Version* v, - VersionEdit* edit, port::Mutex* mu) { + VersionEdit* edit, port::Mutex* mu) { mu->AssertHeld(); if (edit->has_log_number_) { @@ -1455,21 +1453,28 @@ Status VersionSet::Recover() { Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit(NumberLevels()); + VersionEdit edit; s = edit.DecodeFrom(record); - if (s.ok()) { - if (edit.has_comparator_ && - edit.comparator_ != icmp_.user_comparator()->Name()) { - s = Status::InvalidArgument(icmp_.user_comparator()->Name(), - "does not match existing comparator " + - edit.comparator_); - } + if (!s.ok()) { + break; } - if (s.ok()) { - builder.Apply(&edit); + if (edit.max_level_ >= NumberLevels()) { + s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + break; } + if (edit.has_comparator_ && + edit.comparator_ != icmp_.user_comparator()->Name()) { + s = Status::InvalidArgument(icmp_.user_comparator()->Name(), + "does not match existing comparator " + + edit.comparator_); + break; + } + + builder.Apply(&edit); + if (edit.has_log_number_) { log_number = edit.log_number_; have_log_number = true; @@ -1577,7 +1582,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit(NumberLevels()); + VersionEdit edit; s = edit.DecodeFrom(record); if (s.ok()) { if 
(edit.has_comparator_ && diff --git a/db/version_set.cc b/db/version_set.cc @@ -1832,7 +1837,7 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { // TODO: Break up into multiple records to reduce memory usage on recovery? // Save metadata - VersionEdit edit(NumberLevels()); + VersionEdit edit; edit.SetComparatorName(icmp_.user_comparator()->Name()); // Save compaction pointers @@ -2994,7 +2999,7 @@ Compaction::Compaction(int level, int out_level, uint64_t target_file_size, bottommost_level_(false), is_full_compaction_(false), level_ptrs_(std::vector<size_t>(number_levels)) { - edit_ = new VersionEdit(number_levels_); + edit_ = new VersionEdit(); for (int i = 0; i < number_levels_; i++) { level_ptrs_[i] = 0; } diff --git a/db/version_set_reduce_num_levels.cc b/db/version_set_reduce_num_levels.cc index d13a4aed9..07062399b 100644 --- a/db/version_set_reduce_num_levels.cc +++ b/db/version_set_reduce_num_levels.cc @@ -72,8 +72,8 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { num_levels_ = new_levels; compact_pointer_ = new std::string[new_levels]; Init(new_levels); - VersionEdit ve(new_levels); - st = LogAndApply(&ve , mu, true); + VersionEdit ve; + st = LogAndApply(&ve, mu, true); return st; } From 7f3e417f59e9f398abca74ceef370b9861cb7523 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 14 Jan 2014 15:32:37 -0800 Subject: [PATCH 10/27] Fix memtable construction in tests --- db/version_edit_test.cc | 4 ++-- db/write_batch_test.cc | 5 +++-- table/table_test.cc | 13 +++++++++---- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 745ea90d0..63aa32e8f 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -15,7 +15,7 @@ namespace rocksdb { static void TestEncodeDecode(const VersionEdit& edit) { std::string encoded, encoded2; edit.EncodeTo(&encoded); - VersionEdit parsed(); + VersionEdit parsed; Status s = parsed.DecodeFrom(encoded); ASSERT_TRUE(s.ok()) << s.ToString(); parsed.EncodeTo(&encoded2); @@ -27,7 +27,7 @@ class VersionEditTest { }; TEST(VersionEditTest, EncodeDecode) { static const uint64_t kBig = 1ull << 50; - VersionEdit edit(); + VersionEdit edit; for (int i = 0; i < 4; i++) { TestEncodeDecode(edit); edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index ff9aa63ee..931d8f3f5 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -22,10 +22,11 @@ namespace rocksdb { static std::string PrintContents(WriteBatch* b) { InternalKeyComparator cmp(BytewiseComparator()); auto factory = std::make_shared<SkipListFactory>(); - MemTable* mem = new MemTable(cmp, factory.get()); + Options options; + options.memtable_factory = factory; + MemTable* mem = new MemTable(cmp, options); mem->Ref(); std::string state; - Options options; Status s = WriteBatchInternal::InsertInto(b, mem, &options); int count = 0; Iterator* iter = mem->NewIterator(); diff --git a/table/table_test.cc b/table/table_test.cc index 1f79fcdf9..d404e0b2a 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -370,7 +370,9 @@ class MemTableConstructor: public Constructor { : Constructor(cmp), internal_comparator_(cmp), table_factory_(new SkipListFactory) { - memtable_ = new MemTable(internal_comparator_, table_factory_.get()); + Options options; + options.memtable_factory = table_factory_; + memtable_ = new MemTable(internal_comparator_, options); memtable_->Ref(); } ~MemTableConstructor() { } virtual Status FinishImpl(const Options& options, 
const KVMap& data) { delete memtable_->Unref(); - memtable_ = new MemTable(internal_comparator_, table_factory_.get()); + Options memtable_options; + memtable_options.memtable_factory = table_factory_; + memtable_ = new MemTable(internal_comparator_, memtable_options); memtable_->Ref(); int seq = 1; for (KVMap::const_iterator it = data.begin(); @@ -1268,10 +1272,11 @@ class MemTableTest { }; TEST(MemTableTest, Simple) { InternalKeyComparator cmp(BytewiseComparator()); auto table_factory = std::make_shared<SkipListFactory>(); - MemTable* memtable = new MemTable(cmp, table_factory.get()); + Options options; + options.memtable_factory = table_factory; + MemTable* memtable = new MemTable(cmp, options); memtable->Ref(); WriteBatch batch; - Options options; WriteBatchInternal::SetSequence(&batch, 100); batch.Put(std::string("k1"), std::string("v1")); batch.Put(std::string("k2"), std::string("v2")); From 62910202844a6abe5fb5eb23fa0195a3d34d3cf9 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 14 Jan 2014 15:41:30 -0800 Subject: [PATCH 11/27] Fix test --- db/db_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/db/db_test.cc b/db/db_test.cc index 2ff47320a..6e7a2edc2 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -4391,6 +4391,7 @@ TEST(DBTest, GroupCommitTest) { itr->Next(); } ASSERT_TRUE(!itr->Valid()); + delete itr; } while (ChangeOptions()); } From 1ed2404f27b351def723ecd59dd646007e500b3f Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 14 Jan 2014 15:54:11 -0800 Subject: [PATCH 12/27] Wrong number of levels is Invalid argument now, not corruption --- db/db_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/db_test.cc b/db/db_test.cc index 6e7a2edc2..91970381f 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -3397,7 +3397,7 @@ TEST(DBTest, DBOpen_Change_NumLevels) { opts.create_if_missing = false; opts.num_levels = 2; s = DB::Open(opts, dbname, &db); - ASSERT_TRUE(strstr(s.ToString().c_str(), "Corruption") != nullptr); + ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr); ASSERT_TRUE(db == nullptr); } From d9cd7a063f919d4a57334932e57b31571ce87ddc Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 14 Jan 2014 16:19:09 -0800 Subject: [PATCH 13/27] Fix CompactRange to apply filter to every key Summary: When doing CompactRange(), we should first flush the memtable and then calculate max_level_with_files. Also, we want to compact all the levels that have files, including level `max_level_with_files`. This patch fixes the unit test. Test Plan: Added a failing unit test and a fix, so it's not failing anymore. 
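The corrected control flow is easier to see in isolation; this sketch uses hypothetical stand-in methods, not the real DBImpl interface.

// Shape of the fixed CompactRange(): flush first, then compact every level
// that has files, sending the bottom-most level back into itself so the
// compaction filter still sees those keys.
struct DBSketch {
  void Flush();
  int MaxLevelWithFiles();
  bool IsUniversalCompaction();
  void RunManualCompaction(int input_level, int output_level);
};

void CompactEverything(DBSketch* db) {
  db->Flush();  // make sure memtable contents participate
  const int max_level = db->MaxLevelWithFiles();
  for (int level = 0; level <= max_level; level++) {
    const bool compact_in_place =
        db->IsUniversalCompaction() || level == max_level;
    db->RunManualCompaction(level, compact_in_place ? level : level + 1);
  }
}
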
Reviewers: dhruba, haobo, sdong Reviewed By: haobo CC: leveldb, xjin Differential Revision: https://reviews.facebook.net/D14421 --- db/db_impl.cc | 85 +++++++++++++++++++++++----------- db/db_impl.h | 12 ++++- db/db_test.cc | 68 ++++++++++++++++----------- db/version_set.cc | 51 +++++++++++++------- db/version_set.h | 16 +++++-- include/rocksdb/db.h | 1 + util/manual_compaction_test.cc | 75 +++++++++++++++++++++++++++--- 7 files changed, 222 insertions(+), 86 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 4781ad85d..908ede5b4 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1278,8 +1278,11 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress, return s; } -void DBImpl::CompactRange(const Slice* begin, const Slice* end, - bool reduce_level, int target_level) { +void DBImpl::CompactRange(const Slice* begin, + const Slice* end, + bool reduce_level, + int target_level) { + FlushMemTable(FlushOptions()); int max_level_with_files = 1; { MutexLock l(&mutex_); } } } - TEST_FlushMemTable(); // TODO(sanjay): Skip if memtable does not overlap - for (int level = 0; level < max_level_with_files; level++) { - TEST_CompactRange(level, begin, end); + for (int level = 0; level <= max_level_with_files; level++) { + // in case the compaction is universal or if we're compacting the + // bottom-most level, the output level will be the same as input one + if (options_.compaction_style == kCompactionStyleUniversal || + level == max_level_with_files) { + RunManualCompaction(level, level, begin, end); + } else { + RunManualCompaction(level, level + 1, begin, end); + } } if (reduce_level) { @@ -1591,13 +1600,17 @@ Status DBImpl::AppendSortedWalsOfType(const std::string& path, return status; } -void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { - assert(level >= 0); +void DBImpl::RunManualCompaction(int input_level, + int output_level, + const Slice* begin, + const Slice* end) { + assert(input_level >= 0); InternalKey begin_storage, end_storage; ManualCompaction manual; - manual.level = level; + manual.input_level = input_level; + manual.output_level = output_level; manual.done = false; manual.in_progress = false; // For universal compaction, we enforce every manual compaction to compact @@ -1625,11 +1638,11 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { // can compact any range of keys/files. // // bg_manual_only_ is non-zero when at least one thread is inside - // TEST_CompactRange(), i.e. during that time no other compaction will + // RunManualCompaction(), i.e. during that time no other compaction will // get scheduled (see MaybeScheduleFlushOrCompaction). // // Note that the following loop doesn't stop more than one thread calling - // TEST_CompactRange() from getting to the second while loop below. + // RunManualCompaction() from getting to the second while loop below. // However, only one of them will actually schedule compaction, while // others will wait on a condition variable until it completes. @@ -1659,6 +1672,15 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { --bg_manual_only_; } +void DBImpl::TEST_CompactRange(int level, + const Slice* begin, + const Slice* end) { + int output_level = (options_.compaction_style == kCompactionStyleUniversal) + ? 
level + : level + 1; + RunManualCompaction(level, output_level, begin, end); +} + Status DBImpl::FlushMemTable(const FlushOptions& options) { // nullptr batch means just wait for earlier writes to be done Status s = Write(WriteOptions(), nullptr); @@ -1878,23 +1900,27 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, unique_ptr<Compaction> c; bool is_manual = (manual_compaction_ != nullptr) && (manual_compaction_->in_progress == false); - InternalKey manual_end; + InternalKey manual_end_storage; + InternalKey* manual_end = &manual_end_storage; if (is_manual) { ManualCompaction* m = manual_compaction_; assert(!m->in_progress); m->in_progress = true; // another thread cannot pick up the same work - c.reset(versions_->CompactRange(m->level, m->begin, m->end)); - if (c) { - manual_end = c->input(0, c->num_input_files(0) - 1)->largest; - } else { + c.reset(versions_->CompactRange( + m->input_level, m->output_level, m->begin, m->end, &manual_end)); + if (!c) { m->done = true; } Log(options_.info_log, - "Manual compaction at level-%d from %s .. %s; will stop at %s\n", - m->level, + "Manual compaction from level-%d to level-%d from %s .. %s; will stop " + "at %s\n", + m->input_level, + m->output_level, (m->begin ? m->begin->DebugString().c_str() : "(begin)"), (m->end ? m->end->DebugString().c_str() : "(end)"), - (m->done ? "(end)" : manual_end.DebugString().c_str())); + ((m->done || manual_end == nullptr) + ? "(end)" + : manual_end->DebugString().c_str())); } else if (!options_.disable_auto_compactions) { c.reset(versions_->PickCompaction()); } @@ -1959,13 +1985,19 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, // Also note that, if we don't stop here, then the current compaction // writes a new file back to level 0, which will be used in successive // compaction. Hence the manual compaction will never finish. - if (options_.compaction_style == kCompactionStyleUniversal) { + // + // Stop the compaction if manual_end points to nullptr -- this means + // that we compacted the whole range. manual_end should always point + // to nullptr in case of universal compaction + if (manual_end == nullptr) { m->done = true; } if (!m->done) { // We only compacted part of the requested range. Update *m // to the range that is left to be compacted. + // Universal compaction should always compact the whole range + assert(options_.compaction_style != kCompactionStyleUniversal); - m->tmp_storage = manual_end; + m->tmp_storage = *manual_end; m->begin = &m->tmp_storage; } m->in_progress = false; // not being processed anymore @@ -1997,14 +2029,14 @@ void DBImpl::CleanupCompaction(CompactionState* compact, Status status) { } // Allocate the file numbers for the output file. We allocate as -// many output file numbers as there are files in level+1. +// many output file numbers as there are files in level+1 (at least one) // Insert them into pending_outputs so that they do not get deleted. 
void DBImpl::AllocateCompactionOutputFileNumbers(CompactionState* compact) { mutex_.AssertHeld(); assert(compact != nullptr); assert(compact->builder == nullptr); int filesNeeded = compact->compaction->num_input_files(1); - for (int i = 0; i < filesNeeded; i++) { + for (int i = 0; i < std::max(filesNeeded, 1); i++) { uint64_t file_number = versions_->NewFileNumber(); pending_outputs_.insert(file_number); compact->allocated_file_numbers.push_back(file_number); @@ -2148,14 +2180,11 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) { // Add compaction outputs compact->compaction->AddInputDeletions(compact->compaction->edit()); - const int level = compact->compaction->level(); for (size_t i = 0; i < compact->outputs.size(); i++) { const CompactionState::Output& out = compact->outputs[i]; compact->compaction->edit()->AddFile( - (options_.compaction_style == kCompactionStyleUniversal) ? - level : level + 1, - out.number, out.file_size, out.smallest, out.largest, - out.smallest_seqno, out.largest_seqno); + compact->compaction->output_level(), out.number, out.file_size, + out.smallest, out.largest, out.smallest_seqno, out.largest_seqno); } return versions_->LogAndApply(compact->compaction->edit(), &mutex_); } @@ -2197,7 +2226,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, compact->compaction->num_input_files(0), compact->compaction->level(), compact->compaction->num_input_files(1), - compact->compaction->level() + 1, + compact->compaction->output_level(), compact->compaction->score(), options_.max_background_compactions - bg_compaction_scheduled_); char scratch[256]; diff --git a/db/db_impl.h b/db/db_impl.h index d74b77aa4..476b2bf54 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -89,10 +89,17 @@ class DBImpl : public DB { virtual Status GetDbIdentity(std::string& identity); + void RunManualCompaction(int input_level, + int output_level, + const Slice* begin, + const Slice* end); + // Extra methods (for testing) that are not in the public DB interface // Compact any files in the named level that overlap [*begin, *end] - void TEST_CompactRange(int level, const Slice* begin, const Slice* end); + void TEST_CompactRange(int level, + const Slice* begin, + const Slice* end); // Force current memtable contents to be flushed. Status TEST_FlushMemTable(); @@ -406,7 +413,8 @@ class DBImpl : public DB { // Information for a manual compaction struct ManualCompaction { - int level; + int input_level; + int output_level; bool done; bool in_progress; // compaction request being processed? 
const InternalKey* begin; // nullptr means beginning of key range diff --git a/db/db_test.cc b/db/db_test.cc index 91970381f..9c8a97f93 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -3309,34 +3309,46 @@ TEST(DBTest, ManualCompaction) { ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2) << "Need to update this test to match kMaxMemCompactLevel"; - MakeTables(3, "p", "q"); - ASSERT_EQ("1,1,1", FilesPerLevel()); - - // Compaction range falls before files - Compact("", "c"); - ASSERT_EQ("1,1,1", FilesPerLevel()); - - // Compaction range falls after files - Compact("r", "z"); - ASSERT_EQ("1,1,1", FilesPerLevel()); - - // Compaction range overlaps files - Compact("p1", "p9"); - ASSERT_EQ("0,0,1", FilesPerLevel()); - - // Populate a different range - MakeTables(3, "c", "e"); - ASSERT_EQ("1,1,2", FilesPerLevel()); - - // Compact just the new range - Compact("b", "f"); - ASSERT_EQ("0,0,2", FilesPerLevel()); - - // Compact all - MakeTables(1, "a", "z"); - ASSERT_EQ("0,1,2", FilesPerLevel()); - db_->CompactRange(nullptr, nullptr); - ASSERT_EQ("0,0,1", FilesPerLevel()); + // iter - 0 with 7 levels + // iter - 1 with 3 levels + for (int iter = 0; iter < 2; ++iter) { + MakeTables(3, "p", "q"); + ASSERT_EQ("1,1,1", FilesPerLevel()); + + // Compaction range falls before files + Compact("", "c"); + ASSERT_EQ("1,1,1", FilesPerLevel()); + + // Compaction range falls after files + Compact("r", "z"); + ASSERT_EQ("1,1,1", FilesPerLevel()); + + // Compaction range overlaps files + Compact("p1", "p9"); + ASSERT_EQ("0,0,1", FilesPerLevel()); + + // Populate a different range + MakeTables(3, "c", "e"); + ASSERT_EQ("1,1,2", FilesPerLevel()); + + // Compact just the new range + Compact("b", "f"); + ASSERT_EQ("0,0,2", FilesPerLevel()); + + // Compact all + MakeTables(1, "a", "z"); + ASSERT_EQ("0,1,2", FilesPerLevel()); + db_->CompactRange(nullptr, nullptr); + ASSERT_EQ("0,0,1", FilesPerLevel()); + + if (iter == 0) { + Options options = CurrentOptions(); + options.num_levels = 3; + options.create_if_missing = true; + DestroyAndReopen(&options); + } + } + } TEST(DBTest, DBOpen_Options) { diff --git a/db/version_set.cc b/db/version_set.cc index 91b3dcd3f..a411ea210 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2715,6 +2715,7 @@ Compaction* VersionSet::PickCompaction() { bool VersionSet::ParentRangeInCompaction(const InternalKey* smallest, const InternalKey* largest, int level, int* parent_index) { std::vector<FileMetaData*> inputs; + assert(level + 1 < NumberLevels()); current_->GetOverlappingInputs(level+1, smallest, largest, &inputs, *parent_index, parent_index); @@ -2776,7 +2777,8 @@ void VersionSet::ExpandWhileOverlapping(Compaction* c) { // compaction, then we must drop/cancel this compaction. int parent_index = -1; if (FilesInCompaction(c->inputs_[0]) || - ParentRangeInCompaction(&smallest, &largest, level, &parent_index)) { + (c->level() != c->output_level() && + ParentRangeInCompaction(&smallest, &largest, level, &parent_index))) { c->inputs_[0].clear(); c->inputs_[1].clear(); delete c; @@ -2790,7 +2792,9 @@ // user-key with another file. void VersionSet::SetupOtherInputs(Compaction* c) { // If inputs are empty, then there is nothing to expand. 
- if (c->inputs_[0].empty()) { + // If both input and output levels are the same, no need to consider + // files at level "level+1" + if (c->inputs_[0].empty() || c->level() == c->output_level()) { return; } @@ -2918,11 +2922,13 @@ void VersionSet::GetObsoleteFiles(std::vector<FileMetaData*>* files) { obsolete_files_.clear(); } -Compaction* VersionSet::CompactRange( - int level, - const InternalKey* begin, - const InternalKey* end) { +Compaction* VersionSet::CompactRange(int input_level, + int output_level, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end) { std::vector<FileMetaData*> inputs; + bool covering_the_whole_range = true; // All files are 'overlapping' in universal style compaction. // We have to compact the entire range in one shot. @@ -2930,7 +2936,7 @@ Compaction* VersionSet::CompactRange( begin = nullptr; end = nullptr; } - current_->GetOverlappingInputs(level, begin, end, &inputs); + current_->GetOverlappingInputs(input_level, begin, end, &inputs); if (inputs.empty()) { return nullptr; } @@ -2939,24 +2945,26 @@ // But we cannot do this for level-0 since level-0 files can overlap // and we must not pick one file and drop another older file if the // two files overlap. - if (level > 0) { - const uint64_t limit = MaxFileSizeForLevel(level) * - options_->source_compaction_factor; + if (input_level > 0) { + const uint64_t limit = + MaxFileSizeForLevel(input_level) * options_->source_compaction_factor; uint64_t total = 0; - for (size_t i = 0; i < inputs.size(); ++i) { + for (size_t i = 0; i + 1 < inputs.size(); ++i) { uint64_t s = inputs[i]->file_size; total += s; if (total >= limit) { + **compaction_end = inputs[i + 1]->smallest; + covering_the_whole_range = false; inputs.resize(i + 1); break; } } } - int out_level = (options_->compaction_style == kCompactionStyleUniversal) ? - level : level+1; - - Compaction* c = new Compaction(level, out_level, MaxFileSizeForLevel(out_level), - MaxGrandParentOverlapBytes(level), NumberLevels()); + Compaction* c = new Compaction(input_level, + output_level, + MaxFileSizeForLevel(output_level), + MaxGrandParentOverlapBytes(input_level), + NumberLevels()); c->inputs_[0] = inputs; ExpandWhileOverlapping(c); @@ -2969,6 +2977,10 @@ c->input_version_->Ref(); SetupOtherInputs(c); + if (covering_the_whole_range) { + *compaction_end = nullptr; + } + // These files that are to be manually compacted do not trample // upon other files because manual compactions are processed when // the system has a max of 1 background compaction thread. @@ -3016,7 +3028,10 @@ bool Compaction::IsTrivialMove() const { // Avoid a move if there is lots of overlapping grandparent data. // Otherwise, the move could create a parent file that will require // a very expensive merge later on. - return (num_input_files(0) == 1 && + // If level_ == out_level_, the purpose is to force the compaction filter + // to be applied to that level, and thus cannot be a trivial move.
+ return (level_ != out_level_ && + num_input_files(0) == 1 && num_input_files(1) == 0 && TotalFileSize(grandparents_) <= maxGrandParentOverlapBytes_); } @@ -3109,7 +3124,7 @@ void Compaction::SetupBottomMostLevel(bool isManual) { } bottommost_level_ = true; int num_levels = input_version_->vset_->NumberLevels(); - for (int i = level() + 2; i < num_levels; i++) { + for (int i = output_level() + 1; i < num_levels; i++) { if (input_version_->vset_->NumLevelFiles(i) > 0) { bottommost_level_ = false; break; diff --git a/db/version_set.h b/db/version_set.h index 85ff2ff36..2c91532b5 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -310,10 +310,18 @@ class VersionSet { // the specified level. Returns nullptr if there is nothing in that // level that overlaps the specified range. Caller should delete // the result. - Compaction* CompactRange( - int level, - const InternalKey* begin, - const InternalKey* end); + // + // The returned Compaction might not include the whole requested range. + // In that case, compaction_end will be set to the next key that needs + // compacting. In case the compaction will compact the whole range, + // compaction_end will be set to nullptr. + // Client is responsible for compaction_end storage -- when called, + // *compaction_end should point to a valid InternalKey! + Compaction* CompactRange(int input_level, + int output_level, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end); // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index dd17d9e9b..4bf095756 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -199,6 +199,7 @@ class DB { uint64_t* sizes) = 0; // Compact the underlying storage for the key range [*begin,*end]. + // The actual compaction interval might be a superset of [*begin, *end]. // In particular, deleted and overwritten versions are discarded, // and the data is rearranged to reduce the cost of operations // needed to access the data. This operation should typically only diff --git a/util/manual_compaction_test.cc b/util/manual_compaction_test.cc index ebe1339e5..dd615f057 100644 --- a/util/manual_compaction_test.cc +++ b/util/manual_compaction_test.cc @@ -9,9 +9,13 @@ #include #include "rocksdb/db.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/slice.h" #include "rocksdb/write_batch.h" #include "util/testharness.h" +using namespace rocksdb; + namespace { const int kNumKeys = 1100000; @@ -26,12 +30,71 @@ std::string Key2(int i) { return Key1(i) + "_xxx"; } -class ManualCompactionTest { }; +class ManualCompactionTest { + public: + ManualCompactionTest() { + // Get rid of any state from an old run.
+ dbname_ = rocksdb::test::TmpDir() + "/rocksdb_cbug_test"; + DestroyDB(dbname_, rocksdb::Options()); + } + + std::string dbname_; +}; + +class DestroyAllCompactionFilter : public CompactionFilter { + public: + DestroyAllCompactionFilter() {} + + virtual bool Filter(int level, + const Slice& key, + const Slice& existing_value, + std::string* new_value, + bool* value_changed) const { + return existing_value.ToString() == "destroy"; + } + + virtual const char* Name() const { + return "DestroyAllCompactionFilter"; + } +}; + +TEST(ManualCompactionTest, CompactTouchesAllKeys) { + for (int iter = 0; iter < 2; ++iter) { + DB* db; + Options options; + if (iter == 0) { // level compaction + options.num_levels = 3; + options.compaction_style = kCompactionStyleLevel; + } else { // universal compaction + options.compaction_style = kCompactionStyleUniversal; + } + options.create_if_missing = true; + options.compression = rocksdb::kNoCompression; + options.compaction_filter = new DestroyAllCompactionFilter(); + ASSERT_OK(DB::Open(options, dbname_, &db)); + + db->Put(WriteOptions(), Slice("key1"), Slice("destroy")); + db->Put(WriteOptions(), Slice("key2"), Slice("destroy")); + db->Put(WriteOptions(), Slice("key3"), Slice("value3")); + db->Put(WriteOptions(), Slice("key4"), Slice("destroy")); + + Slice key4("key4"); + db->CompactRange(nullptr, &key4); + Iterator* itr = db->NewIterator(ReadOptions()); + itr->SeekToFirst(); + ASSERT_TRUE(itr->Valid()); + ASSERT_EQ("key3", itr->key().ToString()); + itr->Next(); + ASSERT_TRUE(!itr->Valid()); + delete itr; + + delete options.compaction_filter; + delete db; + DestroyDB(dbname_, options); + } +} TEST(ManualCompactionTest, Test) { - // Get rid of any state from an old run. - std::string dbpath = rocksdb::test::TmpDir() + "/rocksdb_cbug_test"; - DestroyDB(dbpath, rocksdb::Options()); // Open database. Disable compression since it affects the creation // of layers and the code below is trying to test against a very @@ -40,7 +103,7 @@ rocksdb::Options db_options; db_options.create_if_missing = true; db_options.compression = rocksdb::kNoCompression; - ASSERT_OK(rocksdb::DB::Open(db_options, dbpath, &db)); + ASSERT_OK(rocksdb::DB::Open(db_options, dbname_, &db)); // create first key range rocksdb::WriteBatch batch; @@ -83,7 +146,7 @@ // close database delete db; - DestroyDB(dbpath, rocksdb::Options()); + DestroyDB(dbname_, rocksdb::Options()); } } // anonymous namespace From 9b51af5a17f3cfd754575894e090dd867fb47740 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 12 Dec 2013 10:54:03 -0800 Subject: [PATCH 14/27] [RocksDB Performance Branch] DBImpl.NewInternalIterator() to reduce work inside mutex Summary: To reduce mutex contention caused by DBImpl.NewInternalIterator(), move all of the iterator creation work in this function out of the mutex, leaving only object ref and get inside it. Test Plan: make all check. Will also run db_stress for a while to make sure there is no problem.
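To make the locking pattern above concrete, here is a minimal, self-contained C++ sketch of the same idea (hypothetical names; an illustration, not the actual db_impl.cc code): take cheap references while holding the mutex, then do the expensive iterator construction after releasing it.

#include <mutex>
#include <vector>

// Stand-in for a memtable or version: Ref() is cheap, NewIterator() is not.
struct Source {
  void Ref() { ++refs_; }
  int* NewIterator() { return new int(0); }  // imagine an expensive build here
  int refs_ = 0;
};

std::mutex mu;                   // plays the role of DBImpl::mutex_
std::vector<Source> sources(3);  // plays the role of mem_/imm_/current()

std::vector<int*> CollectIterators() {
  std::vector<Source*> pinned;
  {
    std::lock_guard<std::mutex> l(mu);  // critical section: take refs only
    for (auto& s : sources) {
      s.Ref();
      pinned.push_back(&s);
    }
  }  // mutex released; the refs keep the sources alive
  std::vector<int*> iters;
  iters.reserve(pinned.size());
  for (Source* s : pinned) {
    iters.push_back(s->NewIterator());  // expensive work happens unlocked
  }
  return iters;
}

int main() {
  for (int* it : CollectIterators()) delete it;
}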
Reviewers: haobo, dhruba, kailiu Reviewed By: haobo CC: igor, leveldb Differential Revision: https://reviews.facebook.net/D14589 Conflicts: db/db_impl.cc --- db/db_impl.cc | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 908ede5b4..07ac5c9d0 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2657,38 +2657,40 @@ static void CleanupIteratorState(void* arg1, void* arg2) { Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, SequenceNumber* latest_snapshot) { IterState* cleanup = new IterState; - mutex_.Lock(); - *latest_snapshot = versions_->LastSequence(); + MemTable* mutable_mem; + std::vector immutables; + Version* version; // Collect together all needed child iterators for mem - std::vector list; + mutex_.Lock(); + *latest_snapshot = versions_->LastSequence(); mem_->Ref(); - list.push_back(mem_->NewIterator(options)); - - cleanup->mem.push_back(mem_); - + mutable_mem = mem_; // Collect together all needed child iterators for imm_ - std::vector immutables; imm_.GetMemTables(&immutables); for (unsigned int i = 0; i < immutables.size(); i++) { - MemTable* m = immutables[i]; - m->Ref(); + immutables[i]->Ref(); + } + // Collect iterators for files in L0 - Ln + versions_->current()->Ref(); + version = versions_->current(); + mutex_.Unlock(); + + std::vector list; + list.push_back(mutable_mem->NewIterator(options)); + cleanup->mem.push_back(mutable_mem); + for (MemTable* m : immutables) { list.push_back(m->NewIterator(options)); cleanup->mem.push_back(m); } - - // Collect iterators for files in L0 - Ln - versions_->current()->AddIterators(options, storage_options_, &list); + version->AddIterators(options, storage_options_, &list); Iterator* internal_iter = NewMergingIterator(&internal_comparator_, &list[0], list.size()); - versions_->current()->Ref(); - + cleanup->version = version; cleanup->mu = &mutex_; cleanup->db = this; - cleanup->version = versions_->current(); internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); - mutex_.Unlock(); return internal_iter; } From c8f16221ed9e0f23b8f11f046e7dddaf6472d2ea Mon Sep 17 00:00:00 2001 From: kailiu Date: Tue, 14 Jan 2014 18:03:56 -0800 Subject: [PATCH 15/27] Fix the return type of WriteBatch::Data(). Summary: Quick fix for https://reviews.facebook.net/D15123 Test Plan: Make check Reviewers: sdong, vkrest CC: leveldb Differential Revision: https://reviews.facebook.net/D15165 --- include/rocksdb/write_batch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index e7ce16005..2cfb731f6 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -88,7 +88,7 @@ class WriteBatch { Status Iterate(Handler* handler) const; // Retrieve the serialized version of this batch. - std::string Data() const { return rep_; } + const std::string& Data() const { return rep_; } // Retrieve data size of the batch. size_t GetDataSize() const { return rep_.size(); } From 65a8a52b546cf5eec3c2895d220fd343353585d2 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 15 Jan 2014 16:15:43 -0800 Subject: [PATCH 16/27] Decrease reliance on VersionSet::NumberLevels() Summary: With column families VersionSet will not have a constant number of levels (each CF can have different options), so we'll need to eliminate call to VersionSet::NumberLevels() This diff decreases number of callsites, but we're not there yet. 
It associates the number of levels with Version (each version is associated with a single CF) instead of VersionSet. I have also slightly changed how VersionSet keeps track of the manifest size. This diff also modifies the constructor of Compaction so that it takes input_version and automatically Ref()s it; previously this was done outside of the constructor. In the next diffs I will continue to decrease the number of callsites of VersionSet::NumberLevels() and also references to current_. Test Plan: make check Reviewers: haobo, dhruba, kailiu, sdong Reviewed By: sdong Differential Revision: https://reviews.facebook.net/D15171 --- db/db_impl.cc | 26 +-- db/db_stats_logger.cc | 5 +- db/version_set.cc | 254 +++++++++++++--------------- db/version_set.h | 27 ++- db/version_set_reduce_num_levels.cc | 5 +- util/ldb_cmd.cc | 2 +- 6 files changed, 153 insertions(+), 166 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 07ac5c9d0..cffcbdfef 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1316,7 +1316,7 @@ int DBImpl::FindMinimumEmptyLevelFitting(int level) { int minimum_level = level; for (int i = level - 1; i > 0; --i) { // stop if level i is not empty - if (versions_->NumLevelFiles(i) > 0) break; + if (versions_->current()->NumLevelFiles(i) > 0) break; // stop if level i is too small (cannot fit the level files) if (versions_->MaxBytesForLevel(i) < versions_->NumLevelBytes(level)) break; @@ -2233,7 +2233,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, compact->compaction->Summary(scratch, sizeof(scratch)); Log(options_.info_log, "Compaction start summary: %s\n", scratch); - assert(versions_->NumLevelFiles(compact->compaction->level()) > 0); + assert(versions_->current()->NumLevelFiles(compact->compaction->level()) > 0); assert(compact->builder == nullptr); assert(!compact->outfile); @@ -3207,7 +3207,7 @@ Status DBImpl::MakeRoomForWrite(bool force, { StopWatch sw(env_, options_.statistics.get(), STALL_L0_SLOWDOWN_COUNT); env_->SleepForMicroseconds( - SlowdownAmount(versions_->NumLevelFiles(0), + SlowdownAmount(versions_->current()->NumLevelFiles(0), options_.level0_slowdown_writes_trigger, options_.level0_stop_writes_trigger) ); @@ -3242,7 +3242,7 @@ Status DBImpl::MakeRoomForWrite(bool force, STALL_MEMTABLE_COMPACTION_MICROS, stall); stall_memtable_compaction_ += stall; stall_memtable_compaction_count_++; - } else if (versions_->NumLevelFiles(0) >= + } else if (versions_->current()->NumLevelFiles(0) >= options_.level0_stop_writes_trigger) { // There are too many level-0 files.
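      // Note: both stall paths above now read the L0 file count from the
      // current Version directly: writes are slowed once L0 reaches
      // level0_slowdown_writes_trigger files and stopped outright at
      // level0_stop_writes_trigger.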
DelayLoggingAndReset(); @@ -3372,6 +3372,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { value->clear(); MutexLock l(&mutex_); + Version* current = versions_->current(); Slice in = property; Slice prefix("rocksdb."); if (!in.starts_with(prefix)) return false; @@ -3386,7 +3387,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { } else { char buf[100]; snprintf(buf, sizeof(buf), "%d", - versions_->NumLevelFiles(static_cast(level))); + current->NumLevelFiles(static_cast(level))); *value = buf; return true; } @@ -3401,7 +3402,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level, - versions_->NumLevelFiles(level), + current->NumLevelFiles(level), versions_->NumLevelBytes(level) / 1048576.0); value->append(buf); } @@ -3446,7 +3447,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { ); value->append(buf); for (int level = 0; level < NumberLevels(); level++) { - int files = versions_->NumLevelFiles(level); + int files = current->NumLevelFiles(level); if (stats_[level].micros > 0 || files > 0) { int64_t bytes_read = stats_[level].bytes_readn + stats_[level].bytes_readnp1; @@ -3728,7 +3729,7 @@ Status DBImpl::DeleteFile(std::string name) { // This is to make sure that any deletion tombstones are not // lost. Check that the level passed is the last level. for (int i = level + 1; i < maxlevel; i++) { - if (versions_->NumLevelFiles(i) != 0) { + if (versions_->current()->NumLevelFiles(i) != 0) { Log(options_.info_log, "DeleteFile %s FAILED. File not in last level\n", name.c_str()); return Status::InvalidArgument("File not in last level"); @@ -3853,12 +3854,11 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { impl->MaybeScheduleLogDBDeployStats(); } } - impl->mutex_.Unlock(); - if (impl->options_.compaction_style == kCompactionStyleUniversal) { - int num_files; + if (s.ok() && impl->options_.compaction_style == kCompactionStyleUniversal) { + Version* current = impl->versions_->current(); for (int i = 1; i < impl->NumberLevels(); i++) { - num_files = impl->versions_->NumLevelFiles(i); + int num_files = current->NumLevelFiles(i); if (num_files > 0) { s = Status::InvalidArgument("Not all files are at level 0. 
Cannot " "open with universal compaction style."); @@ -3867,6 +3867,8 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { } } + impl->mutex_.Unlock(); + if (s.ok()) { *dbptr = impl; } else { diff --git a/db/db_stats_logger.cc b/db/db_stats_logger.cc index 91810abe3..0fd6dd805 100644 --- a/db/db_stats_logger.cc +++ b/db/db_stats_logger.cc @@ -65,8 +65,9 @@ void DBImpl::LogDBDeployStats() { uint64_t file_total_size = 0; uint32_t file_total_num = 0; - for (int i = 0; i < versions_->NumberLevels(); i++) { - file_total_num += versions_->NumLevelFiles(i); + Version* current = versions_->current(); + for (int i = 0; i < current->NumberLevels(); i++) { + file_total_num += current->NumLevelFiles(i); file_total_size += versions_->NumLevelBytes(i); } diff --git a/db/version_set.cc b/db/version_set.cc index a411ea210..b4c1b2233 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -45,7 +45,7 @@ Version::~Version() { next_->prev_ = prev_; // Drop references to files - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < num_levels_; level++) { for (size_t i = 0; i < files_[level].size(); i++) { FileMetaData* f = files_[level][i]; assert(f->refs > 0); @@ -265,7 +265,7 @@ void Version::AddIterators(const ReadOptions& options, // For levels > 0, we can use a concatenating iterator that sequentially // walks through the non-overlapping files in the level, opening them // lazily. - for (int level = 1; level < vset_->NumberLevels(); level++) { + for (int level = 1; level < num_levels_; level++) { if (!files_[level].empty()) { iters->push_back(NewConcatenatingIterator(options, soptions, level)); } @@ -404,17 +404,19 @@ static bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) { } Version::Version(VersionSet* vset, uint64_t version_number) - : vset_(vset), next_(this), prev_(this), refs_(0), - files_(new std::vector[vset->NumberLevels()]), - files_by_size_(vset->NumberLevels()), - next_file_to_compact_by_size_(vset->NumberLevels()), + : vset_(vset), + next_(this), + prev_(this), + refs_(0), + num_levels_(vset->num_levels_), + files_(new std::vector[num_levels_]), + files_by_size_(num_levels_), + next_file_to_compact_by_size_(num_levels_), file_to_compact_(nullptr), file_to_compact_level_(-1), - compaction_score_(vset->NumberLevels()), - compaction_level_(vset->NumberLevels()), - offset_manifest_file_(0), - version_number_(version_number) { -} + compaction_score_(num_levels_), + compaction_level_(num_levels_), + version_number_(version_number) {} void Version::Get(const ReadOptions& options, const LookupKey& k, @@ -453,7 +455,7 @@ void Version::Get(const ReadOptions& options, // levels. Therefore we are guaranteed that if we find data // in an smaller level, later levels are irrelevant (unless we // are MergeInProgress). - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < num_levels_; level++) { size_t num_files = files_[level].size(); if (num_files == 0) continue; @@ -622,7 +624,7 @@ int Version::PickLevelForMemTableOutput( if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) { break; } - if (level + 2 >= vset_->NumberLevels()) { + if (level + 2 >= num_levels_) { level++; break; } @@ -857,7 +859,7 @@ bool Version::HasOverlappingUserKey( std::string Version::DebugString(bool hex) const { std::string r; - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < num_levels_; level++) { // E.g., // --- level 1 --- // 17:123['a' .. 
'd'] @@ -926,20 +928,18 @@ class VersionSet::Builder { public: // Initialize a builder with the files from *base and other info from *vset - Builder(VersionSet* vset, Version* base) - : vset_(vset), - base_(base) { + Builder(VersionSet* vset, Version* base) : vset_(vset), base_(base) { base_->Ref(); - levels_ = new LevelState[vset_->NumberLevels()]; + levels_ = new LevelState[base->NumberLevels()]; BySmallestKey cmp; cmp.internal_comparator = &vset_->icmp_; - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < base->NumberLevels(); level++) { levels_[level].added_files = new FileSet(cmp); } } ~Builder() { - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < base_->NumberLevels(); level++) { const FileSet* added = levels_[level].added_files; std::vector to_unref; to_unref.reserve(added->size()); @@ -962,7 +962,7 @@ class VersionSet::Builder { void CheckConsistency(Version* v) { #ifndef NDEBUG - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < v->NumberLevels(); level++) { // Make sure there is no overlap in levels > 0 if (level > 0) { for (uint32_t i = 1; i < v->files_[level].size(); i++) { @@ -985,7 +985,7 @@ class VersionSet::Builder { #ifndef NDEBUG // a file to be deleted better exist in the previous version bool found = false; - for (int l = 0; !found && l < vset_->NumberLevels(); l++) { + for (int l = 0; !found && l < base_->NumberLevels(); l++) { const std::vector& base_files = base_->files_[l]; for (unsigned int i = 0; i < base_files.size(); i++) { FileMetaData* f = base_files[i]; @@ -998,7 +998,7 @@ class VersionSet::Builder { // if the file did not exist in the previous version, then it // is possibly moved from lower level to higher level in current // version - for (int l = level+1; !found && l < vset_->NumberLevels(); l++) { + for (int l = level+1; !found && l < base_->NumberLevels(); l++) { const FileSet* added = levels_[l].added_files; for (FileSet::const_iterator added_iter = added->begin(); added_iter != added->end(); ++added_iter) { @@ -1081,7 +1081,7 @@ class VersionSet::Builder { CheckConsistency(v); BySmallestKey cmp; cmp.internal_comparator = &vset_->icmp_; - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < base_->NumberLevels(); level++) { // Merge the set of added files with the set of pre-existing files. // Drop any deleted files. Store the result in *v. 
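      // Note: the merge keeps each level sorted by walking base_->files_[level]
      // and levels_[level].added_files together under the BySmallestKey
      // comparator; looping up to base_->NumberLevels() rather than
      // vset_->NumberLevels() prepares Builder for column families, where each
      // Version can have its own level count.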
const std::vector& base_files = base_->files_[level]; @@ -1128,8 +1128,7 @@ class VersionSet::Builder { } }; -VersionSet::VersionSet(const std::string& dbname, - const Options* options, +VersionSet::VersionSet(const std::string& dbname, const Options* options, const EnvOptions& storage_options, TableCache* table_cache, const InternalKeyComparator* cmp) @@ -1149,9 +1148,9 @@ VersionSet::VersionSet(const std::string& dbname, need_slowdown_for_num_level0_files(false), compactions_in_progress_(options_->num_levels), current_version_number_(0), - last_observed_manifest_size_(0), + manifest_file_size_(0), storage_options_(storage_options), - storage_options_compactions_(storage_options_) { + storage_options_compactions_(storage_options_) { compact_pointer_ = new std::string[options_->num_levels]; Init(options_->num_levels); AppendVersion(new Version(this, current_version_number_++)); @@ -1200,7 +1199,7 @@ void VersionSet::AppendVersion(Version* v) { current_ = v; need_slowdown_for_num_level0_files = (options_->level0_slowdown_writes_trigger >= 0 && current_ != nullptr && - NumLevelFiles(0) >= options_->level0_slowdown_writes_trigger); + v->NumLevelFiles(0) >= options_->level0_slowdown_writes_trigger); v->Ref(); // Append to linked list @@ -1250,7 +1249,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, // No need to perform this check if a new Manifest is being created anyways. if (!descriptor_log_ || - last_observed_manifest_size_ > options_->max_manifest_file_size) { + manifest_file_size_ > options_->max_manifest_file_size) { new_descriptor_log = true; manifest_file_number_ = NewFileNumber(); // Change manifest file no. } @@ -1264,7 +1263,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, // because &w is ensuring that all new writes get queued. 
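  // Note: dropping the mutex across the block below is safe because
  // manifest_writers_ serializes LogAndApply() callers; writer &w stays at
  // the head of the queue, so only one thread at a time executes this
  // unlocked manifest write.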
{ // calculate the amount of data being compacted at every level - std::vector size_being_compacted(NumberLevels()-1); + std::vector size_being_compacted(v->NumberLevels() - 1); SizeBeingCompacted(size_being_compacted); mu->Unlock(); @@ -1340,14 +1339,11 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, LogFlush(options_->info_log); mu->Lock(); - // cache the manifest_file_size so that it can be used to rollover in the - // next call to LogAndApply - last_observed_manifest_size_ = new_manifest_file_size; } // Install the new version if (s.ok()) { - v->offset_manifest_file_ = new_manifest_file_size; + manifest_file_size_ = new_manifest_file_size; AppendVersion(v); log_number_ = edit->log_number_; prev_log_number_ = edit->prev_log_number_; @@ -1459,7 +1455,7 @@ Status VersionSet::Recover() { break; } - if (edit.max_level_ >= NumberLevels()) { + if (edit.max_level_ >= current_->NumberLevels()) { s = Status::InvalidArgument( "db has more levels than options.num_levels"); break; @@ -1520,11 +1516,11 @@ Status VersionSet::Recover() { builder.SaveTo(v); // Install recovered version - std::vector size_being_compacted(NumberLevels()-1); + std::vector size_being_compacted(v->NumberLevels() - 1); SizeBeingCompacted(size_being_compacted); Finalize(v, size_being_compacted); - v->offset_manifest_file_ = manifest_file_size; + manifest_file_size_ = manifest_file_size; AppendVersion(v); manifest_file_number_ = next_file; next_file_number_ = next_file + 1; @@ -1548,7 +1544,7 @@ Status VersionSet::Recover() { } Status VersionSet::DumpManifest(Options& options, std::string& dscname, - bool verbose, bool hex) { + bool verbose, bool hex) { struct LogReporter : public log::Reader::Reporter { Status* status; virtual void Corruption(size_t bytes, const Status& s) { @@ -1652,7 +1648,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, builder.SaveTo(v); // Install recovered version - std::vector size_being_compacted(NumberLevels()-1); + std::vector size_being_compacted(v->NumberLevels() - 1); SizeBeingCompacted(size_being_compacted); Finalize(v, size_being_compacted); @@ -1683,7 +1679,7 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) { } void VersionSet::Finalize(Version* v, - std::vector& size_being_compacted) { + std::vector& size_being_compacted) { // Pre-sort level0 for Get() if (options_->compaction_style == kCompactionStyleUniversal) { std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirstBySeqNo); @@ -1696,7 +1692,7 @@ void VersionSet::Finalize(Version* v, int num_levels_to_check = (options_->compaction_style != kCompactionStyleUniversal) ? - NumberLevels() - 1 : 1; + v->NumberLevels() - 1 : 1; for (int level = 0; level < num_levels_to_check; level++) { @@ -1757,8 +1753,8 @@ void VersionSet::Finalize(Version* v, // sort all the levels based on their score. Higher scores get listed // first. Use bubble sort because the number of entries are small. - for (int i = 0; i < NumberLevels()-2; i++) { - for (int j = i+1; j < NumberLevels()-1; j++) { + for (int i = 0; i < v->NumberLevels() - 2; i++) { + for (int j = i + 1; j < v->NumberLevels() - 1; j++) { if (v->compaction_score_[i] < v->compaction_score_[j]) { double score = v->compaction_score_[i]; int level = v->compaction_level_[i]; @@ -1793,8 +1789,9 @@ static bool compareSeqnoDescending(const VersionSet::Fsize& first, void VersionSet::UpdateFilesBySize(Version* v) { // No need to sort the highest level because it is never compacted. 
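  // Note: with universal compaction the sort below covers all
  // v->NumberLevels() levels; with level-style compaction the last level is
  // skipped because it is never compacted.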
- int max_level = (options_->compaction_style == kCompactionStyleUniversal) ? - NumberLevels() : NumberLevels() - 1; + int max_level = (options_->compaction_style == kCompactionStyleUniversal) + ? v->NumberLevels() + : v->NumberLevels() - 1; for (int level = 0; level < max_level; level++) { @@ -1850,7 +1847,7 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { } // Save files - for (int level = 0; level < NumberLevels(); level++) { + for (int level = 0; level < current_->NumberLevels(); level++) { const std::vector& files = current_->files_[level]; for (size_t i = 0; i < files.size(); i++) { const FileMetaData* f = files[i]; @@ -1864,15 +1861,9 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { return log->AddRecord(record); } -int VersionSet::NumLevelFiles(int level) const { - assert(level >= 0); - assert(level < NumberLevels()); - return current_->files_[level].size(); -} - const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const { int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files["); - for (int i = 0; i < NumberLevels(); i++) { + for (int i = 0; i < current_->NumberLevels(); i++) { int sz = sizeof(scratch->buffer) - len; int ret = snprintf(scratch->buffer + len, sz, "%d ", int(current_->files_[i].size())); @@ -1884,10 +1875,10 @@ const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const { return scratch->buffer; } -const char* VersionSet::LevelDataSizeSummary( - LevelSummaryStorage* scratch) const { +const char* VersionSet::LevelDataSizeSummary(LevelSummaryStorage* scratch) + const { int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); - for (int i = 0; i < NumberLevels(); i++) { + for (int i = 0; i < current_->NumberLevels(); i++) { int sz = sizeof(scratch->buffer) - len; int ret = snprintf(scratch->buffer + len, sz, "%lu ", (unsigned long)NumLevelBytes(i)); @@ -1950,7 +1941,7 @@ bool VersionSet::ManifestContains(const std::string& record) const { uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { uint64_t result = 0; - for (int level = 0; level < NumberLevels(); level++) { + for (int level = 0; level < v->NumberLevels(); level++) { const std::vector& files = v->files_[level]; for (size_t i = 0; i < files.size(); i++) { if (icmp_.Compare(files[i]->largest, ikey) <= 0) { @@ -1987,7 +1978,7 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { for (Version* v = dummy_versions_.next_; v != &dummy_versions_; v = v->next_) { - for (int level = 0; level < NumberLevels(); level++) { + for (int level = 0; level < v->NumberLevels(); level++) { total_files += v->files_[level].size(); } } @@ -1998,7 +1989,7 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { for (Version* v = dummy_versions_.next_; v != &dummy_versions_; v = v->next_) { - for (int level = 0; level < NumberLevels(); level++) { + for (int level = 0; level < v->NumberLevels(); level++) { for (const auto& f : v->files_[level]) { live_list->push_back(f->number); } @@ -2008,7 +1999,7 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { void VersionSet::AddLiveFilesCurrentVersion(std::set* live) { Version* v = current_; - for (int level = 0; level < NumberLevels(); level++) { + for (int level = 0; level < v->NumberLevels(); level++) { const std::vector& files = v->files_[level]; for (size_t i = 0; i < files.size(); i++) { live->insert(files[i]->number); @@ -2018,7 +2009,7 @@ void VersionSet::AddLiveFilesCurrentVersion(std::set* live) { int64_t VersionSet::NumLevelBytes(int level) const { assert(level >= 0); - 
assert(level < NumberLevels()); + assert(level < current_->NumberLevels()); assert(current_); return TotalFileSize(current_->files_[level]); } @@ -2026,7 +2017,7 @@ int64_t VersionSet::NumLevelBytes(int level) const { int64_t VersionSet::MaxNextLevelOverlappingBytes() { uint64_t result = 0; std::vector overlaps; - for (int level = 1; level < NumberLevels() - 1; level++) { + for (int level = 1; level < current_->NumberLevels() - 1; level++) { for (size_t i = 0; i < current_->files_[level].size(); i++) { const FileMetaData* f = current_->files_[level][i]; current_->GetOverlappingInputs(level+1, &f->smallest, &f->largest, @@ -2200,7 +2191,7 @@ void VersionSet::ReleaseCompactionFiles(Compaction* c, Status status) { // The total size of files that are currently being compacted // at at every level upto the penultimate level. void VersionSet::SizeBeingCompacted(std::vector& sizes) { - for (int level = 0; level < NumberLevels()-1; level++) { + for (int level = 0; level < NumberLevels() - 1; level++) { uint64_t total = 0; for (std::set::iterator it = compactions_in_progress_[level].begin(); @@ -2223,8 +2214,8 @@ void VersionSet::SizeBeingCompacted(std::vector& sizes) { // base file (overrides configured values of file-size ratios, // min_merge_width and max_merge_width). // -Compaction* VersionSet::PickCompactionUniversalSizeAmp( - int level, double score) { +Compaction* VersionSet::PickCompactionUniversalSizeAmp(int level, + double score) { assert (level == 0); // percentage flexibilty while reducing size amplification @@ -2306,13 +2297,13 @@ Compaction* VersionSet::PickCompactionUniversalSizeAmp( // create a compaction request // We always compact all the files, so always compress. - Compaction* c = new Compaction(level, level, MaxFileSizeForLevel(level), - LLONG_MAX, NumberLevels(), false, - true); + Compaction* c = + new Compaction(current_, level, level, MaxFileSizeForLevel(level), + LLONG_MAX, false, true); c->score_ = score; for (unsigned int loop = start_index; loop < file_by_time.size(); loop++) { int index = file_by_time[loop]; - f = current_->files_[level][index]; + f = c->input_version_->files_[level][index]; c->inputs_[0].push_back(f); Log(options_->info_log, "Universal: size amp picking file %lu[%d] with size %lu", @@ -2436,14 +2427,14 @@ Compaction* VersionSet::PickCompactionUniversalReadAmp( } } } - Compaction* c = new Compaction(level, level, MaxFileSizeForLevel(level), - LLONG_MAX, NumberLevels(), false, - enable_compression); + Compaction* c = + new Compaction(current_, level, level, MaxFileSizeForLevel(level), + LLONG_MAX, false, enable_compression); c->score_ = score; for (unsigned int i = start_index; i < first_index_after; i++) { int index = file_by_time[i]; - FileMetaData* f = current_->files_[level][index]; + FileMetaData* f = c->input_version_->files_[level][index]; c->inputs_[0].push_back(f); Log(options_->info_log, "Universal: Picking file %lu[%d] with size %lu\n", (unsigned long)f->number, @@ -2505,11 +2496,11 @@ Compaction* VersionSet::PickCompactionUniversal(int level, double score) { } // The files are sorted from newest first to oldest last. - std::vector& file_by_time = current_->files_by_size_[level]; + std::vector& file_by_time = c->input_version_->files_by_size_[level]; // Is the earliest file part of this compaction? 
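    // Note: since file_by_time is ordered newest first, its last entry is the
    // oldest file in the level; when that file is among the chosen inputs the
    // compaction reaches the oldest data, so its output can be marked
    // bottommost.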
int last_index = file_by_time[file_by_time.size()-1]; - FileMetaData* last_file = current_->files_[level][last_index]; + FileMetaData* last_file = c->input_version_->files_[level][last_index]; if (c->inputs_[0][c->inputs_[0].size()-1] == last_file) { c->bottommost_level_ = true; } @@ -2520,9 +2511,6 @@ Compaction* VersionSet::PickCompactionUniversal(int level, double score) { c->inputs_[0].size()); } - c->input_version_ = current_; - c->input_version_->Ref(); - // mark all the files that are being compacted c->MarkFilesBeingCompacted(true); @@ -2531,7 +2519,8 @@ Compaction* VersionSet::PickCompactionUniversal(int level, double score) { // Record whether this compaction includes all sst files. // For now, it is only relevant in universal compaction mode. - c->is_full_compaction_ = (c->inputs_[0].size() == current_->files_[0].size()); + c->is_full_compaction_ = + (c->inputs_[0].size() == c->input_version_->files_[0].size()); return c; } @@ -2548,27 +2537,28 @@ Compaction* VersionSet::PickCompactionBySize(int level, double score) { } assert(level >= 0); - assert(level+1 < NumberLevels()); - c = new Compaction(level, level+1, MaxFileSizeForLevel(level+1), - MaxGrandParentOverlapBytes(level), NumberLevels()); + assert(level + 1 < current_->NumberLevels()); + c = new Compaction(current_, level, level + 1, MaxFileSizeForLevel(level + 1), + MaxGrandParentOverlapBytes(level)); c->score_ = score; // Pick the largest file in this level that is not already // being compacted - std::vector& file_size = current_->files_by_size_[level]; + std::vector& file_size = c->input_version_->files_by_size_[level]; // record the first file that is not yet compacted int nextIndex = -1; - for (unsigned int i = current_->next_file_to_compact_by_size_[level]; + for (unsigned int i = c->input_version_->next_file_to_compact_by_size_[level]; i < file_size.size(); i++) { int index = file_size[i]; - FileMetaData* f = current_->files_[level][index]; + FileMetaData* f = c->input_version_->files_[level][index]; // check to verify files are arranged in descending size assert((i == file_size.size() - 1) || - (i >= Version::number_of_files_to_sort_-1) || - (f->file_size >= current_->files_[level][file_size[i+1]]->file_size)); + (i >= Version::number_of_files_to_sort_ - 1) || + (f->file_size >= + c->input_version_->files_[level][file_size[i + 1]]->file_size)); // do not pick a file to compact if it is being compacted // from n-1 level. 
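    // Note: the scan records nextIndex, the first file not yet picked, and the
    // next hunk stores it in next_file_to_compact_by_size_[level], so the
    // following PickCompaction() call resumes the size-ordered scan instead of
    // restarting from the largest file.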
@@ -2604,7 +2594,7 @@ Compaction* VersionSet::PickCompactionBySize(int level, double score) { } // store where to start the iteration in the next call to PickCompaction - current_->next_file_to_compact_by_size_[level] = nextIndex; + c->input_version_->next_file_to_compact_by_size_[level] = nextIndex; return c; } @@ -2655,11 +2645,12 @@ Compaction* VersionSet::PickCompaction() { if (level != 0 || compactions_in_progress_[0].empty()) { if(!ParentRangeInCompaction(&f->smallest, &f->largest, level, &parent_index)) { - c = new Compaction(level, level+1, MaxFileSizeForLevel(level+1), - MaxGrandParentOverlapBytes(level), NumberLevels(), true); + c = new Compaction(current_, level, level + 1, + MaxFileSizeForLevel(level + 1), + MaxGrandParentOverlapBytes(level), true); c->inputs_[0].push_back(f); c->parent_index_ = parent_index; - current_->file_to_compact_ = nullptr; + c->input_version_->file_to_compact_ = nullptr; ExpandWhileOverlapping(c); } } @@ -2669,9 +2660,6 @@ Compaction* VersionSet::PickCompaction() { return nullptr; } - c->input_version_ = current_; - c->input_version_->Ref(); - // Two level 0 compaction won't run at the same time, so don't need to worry // about files on level 0 being compacted. if (level == 0) { @@ -2682,7 +2670,8 @@ Compaction* VersionSet::PickCompaction() { // c->inputs_[0] earlier and replace it with an overlapping set // which will include the picked file. c->inputs_[0].clear(); - current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]); + c->input_version_->GetOverlappingInputs(0, &smallest, &largest, + &c->inputs_[0]); // If we include more L0 files in the same compaction run it can // cause the 'smallest' and 'largest' key to get extended to a @@ -2713,12 +2702,13 @@ Compaction* VersionSet::PickCompaction() { // Returns true if any one of the parent files are being compacted bool VersionSet::ParentRangeInCompaction(const InternalKey* smallest, - const InternalKey* largest, int level, int* parent_index) { + const InternalKey* largest, int level, + int* parent_index) { std::vector inputs; - assert(level + 1 < NumberLevels()); + assert(level + 1 < current_->NumberLevels()); - current_->GetOverlappingInputs(level+1, smallest, largest, - &inputs, *parent_index, parent_index); + current_->GetOverlappingInputs(level + 1, smallest, largest, &inputs, + *parent_index, parent_index); return FilesInCompaction(inputs); } @@ -2766,8 +2756,8 @@ void VersionSet::ExpandWhileOverlapping(Compaction* c) { old_size = c->inputs_[0].size(); GetRange(c->inputs_[0], &smallest, &largest); c->inputs_[0].clear(); - current_->GetOverlappingInputs(level, &smallest, &largest, &c->inputs_[0], - hint_index, &hint_index); + c->input_version_->GetOverlappingInputs( + level, &smallest, &largest, &c->inputs_[0], hint_index, &hint_index); } while(c->inputs_[0].size() > old_size); // Get the new range @@ -2805,8 +2795,9 @@ void VersionSet::SetupOtherInputs(Compaction* c) { GetRange(c->inputs_[0], &smallest, &largest); // Populate the set of next-level files (inputs_[1]) to include in compaction - current_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1], - c->parent_index_, &c->parent_index_); + c->input_version_->GetOverlappingInputs(level + 1, &smallest, &largest, + &c->inputs_[1], c->parent_index_, + &c->parent_index_); // Get entire range covered by compaction InternalKey all_start, all_limit; @@ -2819,8 +2810,8 @@ void VersionSet::SetupOtherInputs(Compaction* c) { // can happen when one user key spans multiple files. 
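  // Note: the expansion below only ever grows inputs_[0]; expanded0 is
  // accepted just when its level+1 overlap (expanded1) matches inputs_[1]
  // file-for-file and none of the candidate files are already being
  // compacted, so widening the input level never drags extra level+1 files
  // into the compaction.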
if (!c->inputs_[1].empty()) { std::vector expanded0; - current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0, - c->base_index_, nullptr); + c->input_version_->GetOverlappingInputs( + level, &all_start, &all_limit, &expanded0, c->base_index_, nullptr); const uint64_t inputs0_size = TotalFileSize(c->inputs_[0]); const uint64_t inputs1_size = TotalFileSize(c->inputs_[1]); const uint64_t expanded0_size = TotalFileSize(expanded0); @@ -2828,13 +2819,13 @@ void VersionSet::SetupOtherInputs(Compaction* c) { if (expanded0.size() > c->inputs_[0].size() && inputs1_size + expanded0_size < limit && !FilesInCompaction(expanded0) && - !current_->HasOverlappingUserKey(&expanded0, level)) { + !c->input_version_->HasOverlappingUserKey(&expanded0, level)) { InternalKey new_start, new_limit; GetRange(expanded0, &new_start, &new_limit); std::vector expanded1; - current_->GetOverlappingInputs(level+1, &new_start, &new_limit, - &expanded1, c->parent_index_, - &c->parent_index_); + c->input_version_->GetOverlappingInputs(level + 1, &new_start, &new_limit, + &expanded1, c->parent_index_, + &c->parent_index_); if (expanded1.size() == c->inputs_[1].size() && !FilesInCompaction(expanded1)) { Log(options_->info_log, @@ -2861,8 +2852,8 @@ void VersionSet::SetupOtherInputs(Compaction* c) { // Compute the set of grandparent files that overlap this compaction // (parent == level+1; grandparent == level+2) if (level + 2 < NumberLevels()) { - current_->GetOverlappingInputs(level + 2, &all_start, &all_limit, - &c->grandparents_); + c->input_version_->GetOverlappingInputs(level + 2, &all_start, &all_limit, + &c->grandparents_); } if (false) { @@ -2880,10 +2871,8 @@ void VersionSet::SetupOtherInputs(Compaction* c) { c->edit_->SetCompactPointer(level, largest); } -Status VersionSet::GetMetadataForFile( - uint64_t number, - int *filelevel, - FileMetaData *meta) { +Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, + FileMetaData* meta) { for (int level = 0; level < NumberLevels(); level++) { const std::vector& files = current_->files_[level]; for (size_t i = 0; i < files.size(); i++) { @@ -2897,8 +2886,7 @@ Status VersionSet::GetMetadataForFile( return Status::NotFound("File not present in any level"); } -void VersionSet::GetLiveFilesMetaData( - std::vector * metadata) { +void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { for (int level = 0; level < NumberLevels(); level++) { const std::vector& files = current_->files_[level]; for (size_t i = 0; i < files.size(); i++) { @@ -2960,11 +2948,9 @@ Compaction* VersionSet::CompactRange(int input_level, } } } - Compaction* c = new Compaction(input_level, - output_level, + Compaction* c = new Compaction(current_, input_level, output_level, MaxFileSizeForLevel(output_level), - MaxGrandParentOverlapBytes(input_level), - NumberLevels()); + MaxGrandParentOverlapBytes(input_level)); c->inputs_[0] = inputs; ExpandWhileOverlapping(c); @@ -2973,8 +2959,6 @@ Compaction* VersionSet::CompactRange(int input_level, return nullptr; } - c->input_version_ = current_; - c->input_version_->Ref(); SetupOtherInputs(c); if (covering_the_whole_range) { @@ -2991,15 +2975,16 @@ Compaction* VersionSet::CompactRange(int input_level, return c; } -Compaction::Compaction(int level, int out_level, uint64_t target_file_size, - uint64_t max_grandparent_overlap_bytes, int number_levels, - bool seek_compaction, bool enable_compression) +Compaction::Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, + uint64_t 
max_grandparent_overlap_bytes, + bool seek_compaction, bool enable_compression) : level_(level), out_level_(out_level), max_output_file_size_(target_file_size), maxGrandParentOverlapBytes_(max_grandparent_overlap_bytes), - input_version_(nullptr), - number_levels_(number_levels), + input_version_(input_version), + number_levels_(input_version_->NumberLevels()), seek_compaction_(seek_compaction), enable_compression_(enable_compression), grandparent_index_(0), @@ -3010,7 +2995,9 @@ Compaction::Compaction(int level, int out_level, uint64_t target_file_size, score_(0), bottommost_level_(false), is_full_compaction_(false), - level_ptrs_(std::vector(number_levels)) { + level_ptrs_(std::vector(number_levels_)) { + + input_version_->Ref(); edit_ = new VersionEdit(); for (int i = 0; i < number_levels_; i++) { level_ptrs_[i] = 0; @@ -3125,7 +3112,7 @@ void Compaction::SetupBottomMostLevel(bool isManual) { bottommost_level_ = true; int num_levels = input_version_->vset_->NumberLevels(); for (int i = output_level() + 1; i < num_levels; i++) { - if (input_version_->vset_->NumLevelFiles(i) > 0) { + if (input_version_->NumLevelFiles(i) > 0) { bottommost_level_ = false; break; } @@ -3143,9 +3130,8 @@ void Compaction::ResetNextCompactionIndex() { input_version_->ResetNextCompactionIndex(level_); } -static void InputSummary(std::vector& files, - char* output, - int len) { +static void InputSummary(std::vector& files, char* output, + int len) { int write = 0; for (unsigned int i = 0; i < files.size(); i++) { int sz = len - write; diff --git a/db/version_set.h b/db/version_set.h index 2c91532b5..68c41b160 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -135,7 +135,10 @@ class Version { int PickLevelForMemTableOutput(const Slice& smallest_user_key, const Slice& largest_user_key); - int NumFiles(int level) const { return files_[level].size(); } + int NumberLevels() const { return num_levels_; } + + // REQUIRES: lock is held + int NumLevelFiles(int level) const { return files_[level].size(); } // Return a human readable string that describes this version's contents. std::string DebugString(bool hex = false) const; @@ -161,6 +164,7 @@ class Version { Version* next_; // Next version in linked list Version* prev_; // Previous version in linked list int refs_; // Number of live refs to this version + int num_levels_; // Number of levels // List of files per level, files in each level are arranged // in increasing order of keys @@ -197,9 +201,6 @@ class Version { double max_compaction_score_; // max score in l1 to ln-1 int max_compaction_score_level_; // level on which max score occurs - // The offset in the manifest file where this version is stored. - uint64_t offset_manifest_file_; - // A version number that uniquely represents this version. This is // used for debugging and logging purposes only. uint64_t version_number_; @@ -234,7 +235,7 @@ class VersionSet { // REQUIRES: *mu is held on entry. // REQUIRES: no other thread concurrently calls LogAndApply() Status LogAndApply(VersionEdit* edit, port::Mutex* mu, - bool new_descriptor_log = false); + bool new_descriptor_log = false); // Recover the last saved descriptor from persistent storage. Status Recover(); @@ -271,9 +272,6 @@ class VersionSet { } } - // Return the number of Table files at the specified level. - int NumLevelFiles(int level) const; - // Return the combined file size of all files at the specified level. 
int64_t NumLevelBytes(int level) const; @@ -400,7 +398,7 @@ class VersionSet { const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const; // Return the size of the current manifest file - const uint64_t ManifestFileSize() { return current_->offset_manifest_file_; } + uint64_t ManifestFileSize() const { return manifest_file_size_; } // For the specfied level, pick a compaction. // Returns nullptr if there is no compaction to be done. @@ -524,9 +522,8 @@ class VersionSet { // Queue of writers to the manifest file std::deque manifest_writers_; - // Store the manifest file size when it is checked. - // Save us the cost of checking file size twice in LogAndApply - uint64_t last_observed_manifest_size_; + // Current size of manifest file + uint64_t manifest_file_size_; std::vector obsolete_files_; @@ -619,9 +616,9 @@ class Compaction { friend class Version; friend class VersionSet; - explicit Compaction(int level, int out_level, uint64_t target_file_size, - uint64_t max_grandparent_overlap_bytes, int number_levels, - bool seek_compaction = false, bool enable_compression = true); + Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes, + bool seek_compaction = false, bool enable_compression = true); int level_; int out_level_; // levels to which output files are stored diff --git a/db/version_set_reduce_num_levels.cc b/db/version_set_reduce_num_levels.cc index 07062399b..2ca689809 100644 --- a/db/version_set_reduce_num_levels.cc +++ b/db/version_set_reduce_num_levels.cc @@ -25,7 +25,7 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { } Version* current_version = current_; - int current_levels = NumberLevels(); + int current_levels = current_version->NumberLevels(); if (current_levels <= new_levels) { return Status::OK(); @@ -36,7 +36,7 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { int first_nonempty_level = -1; int first_nonempty_level_filenum = 0; for (int i = new_levels - 1; i < current_levels; i++) { - int file_num = NumLevelFiles(i); + int file_num = current_version->NumLevelFiles(i); if (file_num != 0) { if (first_nonempty_level < 0) { first_nonempty_level = i; @@ -65,6 +65,7 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { delete[] current_version->files_; current_version->files_ = new_files_list; + current_version->num_levels_ = new_levels; delete[] compact_pointer_; delete[] max_file_size_; diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 58d81460e..65ecd61a2 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -1024,7 +1024,7 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, } int max = -1; for (int i = 0; i < versions.NumberLevels(); i++) { - if (versions.NumLevelFiles(i)) { + if (versions.current()->NumLevelFiles(i)) { max = i; } } From 2f4eda78906e5922c519f3ba49e7a3fe1bdd1403 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 15 Jan 2014 16:18:04 -0800 Subject: [PATCH 17/27] Move functions from VersionSet to Version Summary: There were some functions in VersionSet that had no reason to be there instead of Version. Moving them to Version will make column families implementation easier. 
The functions moved are: * NumLevelBytes * LevelSummary * LevelFileSummary * MaxNextLevelOverlappingBytes * AddLiveFiles (previously AddLiveFilesCurrentVersion()) * NeedSlowdownForNumLevel0Files The diff continues on (and depends on) D15171 Test Plan: make check Reviewers: dhruba, haobo, kailiu, sdong, emayanke Reviewed By: sdong CC: leveldb Differential Revision: https://reviews.facebook.net/D15183 --- db/db_filesnapshot.cc | 2 +- db/db_impl.cc | 50 +++++++------- db/db_impl.h | 2 +- db/db_stats_logger.cc | 6 +- db/version_set.cc | 152 ++++++++++++++++++------------------------ db/version_set.h | 67 +++++++++---------- 6 files changed, 125 insertions(+), 154 deletions(-) diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index a7232246a..04d6d0e17 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -74,7 +74,7 @@ Status DBImpl::GetLiveFiles(std::vector& ret, // Make a set of all of the live *.sst files std::set live; - versions_->AddLiveFilesCurrentVersion(&live); + versions_->current()->AddLiveFiles(&live); ret.clear(); ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST diff --git a/db/db_impl.cc b/db/db_impl.cc index cffcbdfef..e84817b9b 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1313,13 +1313,13 @@ void DBImpl::CompactRange(const Slice* begin, // return the same level if it cannot be moved int DBImpl::FindMinimumEmptyLevelFitting(int level) { mutex_.AssertHeld(); + Version* current = versions_->current(); int minimum_level = level; for (int i = level - 1; i > 0; --i) { // stop if level i is not empty - if (versions_->current()->NumLevelFiles(i) > 0) break; - + if (current->NumLevelFiles(i) > 0) break; // stop if level i is too small (cannot fit the level files) - if (versions_->MaxBytesForLevel(i) < versions_->NumLevelBytes(level)) break; + if (versions_->MaxBytesForLevel(i) < current->NumLevelBytes(level)) break; minimum_level = i; } @@ -1826,6 +1826,11 @@ void DBImpl::TEST_PurgeObsoleteteWAL() { PurgeObsoleteWALFiles(); } +uint64_t DBImpl::TEST_GetLevel0TotalSize() { + MutexLock l(&mutex_); + return versions_->current()->NumLevelBytes(0); +} + void DBImpl::BackgroundCallCompaction() { bool madeProgress = false; DeletionState deletion_state(options_.max_write_buffer_number, true); @@ -1939,13 +1944,11 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, f->smallest_seqno, f->largest_seqno); status = versions_->LogAndApply(c->edit(), &mutex_); InstallSuperVersion(deletion_state); - VersionSet::LevelSummaryStorage tmp; + Version::LevelSummaryStorage tmp; Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n", - static_cast(f->number), - c->level() + 1, + static_cast(f->number), c->level() + 1, static_cast(f->file_size), - status.ToString().c_str(), - versions_->LevelSummary(&tmp)); + status.ToString().c_str(), versions_->current()->LevelSummary(&tmp)); versions_->ReleaseCompactionFiles(c.get(), status); *madeProgress = true; } else { @@ -2605,22 +2608,21 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, status = InstallCompactionResults(compact); InstallSuperVersion(deletion_state); } - VersionSet::LevelSummaryStorage tmp; + Version::LevelSummaryStorage tmp; Log(options_.info_log, "compacted to: %s, %.1f MB/sec, level %d, files in(%d, %d) out(%d) " "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " "write-amplify(%.1f) %s\n", - versions_->LevelSummary(&tmp), + versions_->current()->LevelSummary(&tmp), (stats.bytes_readn + stats.bytes_readnp1 + stats.bytes_written) / - (double) stats.micros, - 
compact->compaction->output_level(), - stats.files_in_leveln, stats.files_in_levelnp1, stats.files_out_levelnp1, - stats.bytes_readn / 1048576.0, - stats.bytes_readnp1 / 1048576.0, + (double)stats.micros, + compact->compaction->output_level(), stats.files_in_leveln, + stats.files_in_levelnp1, stats.files_out_levelnp1, + stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0, stats.bytes_written / 1048576.0, (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) / - (double) stats.bytes_readn, - stats.bytes_written / (double) stats.bytes_readn, + (double)stats.bytes_readn, + stats.bytes_written / (double)stats.bytes_readn, status.ToString().c_str()); return status; @@ -2701,7 +2703,7 @@ Iterator* DBImpl::TEST_NewInternalIterator() { int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { MutexLock l(&mutex_); - return versions_->MaxNextLevelOverlappingBytes(); + return versions_->current()->MaxNextLevelOverlappingBytes(); } Status DBImpl::Get(const ReadOptions& options, @@ -3193,9 +3195,7 @@ Status DBImpl::MakeRoomForWrite(bool force, // Yield previous error s = bg_error_; break; - } else if ( - allow_delay && - versions_->NeedSlowdownForNumLevel0Files()) { + } else if (allow_delay && versions_->NeedSlowdownForNumLevel0Files()) { // We are getting close to hitting a hard limit on the number of // L0 files. Rather than delaying a single write by several // seconds when we hit the hard limit, start delaying each @@ -3403,7 +3403,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { "%3d %8d %8.0f\n", level, current->NumLevelFiles(level), - versions_->NumLevelBytes(level) / 1048576.0); + current->NumLevelBytes(level) / 1048576.0); value->append(buf); } return true; @@ -3446,7 +3446,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n" ); value->append(buf); - for (int level = 0; level < NumberLevels(); level++) { + for (int level = 0; level < current->NumberLevels(); level++) { int files = current->NumLevelFiles(level); if (stats_[level].micros > 0 || files > 0) { int64_t bytes_read = stats_[level].bytes_readn + @@ -3468,8 +3468,8 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { "%3d %8d %8.0f %5.1f %9.0f %9.0f %9.0f %9.0f %9.0f %9.0f %10.1f %9.1f %11.1f %8d %8d %8d %8d %8d %9.1f %9lu\n", level, files, - versions_->NumLevelBytes(level) / 1048576.0, - versions_->NumLevelBytes(level) / + current->NumLevelBytes(level) / 1048576.0, + current->NumLevelBytes(level) / versions_->MaxBytesForLevel(level), stats_[level].micros / 1e6, bytes_read / 1048576.0, diff --git a/db/db_impl.h b/db/db_impl.h index 476b2bf54..214affac7 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -129,7 +129,7 @@ class DBImpl : public DB { void TEST_PurgeObsoleteteWAL(); // get total level0 file size. Only for testing. 
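  // Note: the accessor below moves out of line so that its definition in
  // db_impl.cc can acquire mutex_ before touching the current Version; the
  // old inline body read versions_->NumLevelBytes(0) with no lock held.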
- uint64_t TEST_GetLevel0TotalSize() { return versions_->NumLevelBytes(0);} + uint64_t TEST_GetLevel0TotalSize(); void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL) { diff --git a/db/db_stats_logger.cc b/db/db_stats_logger.cc index 0fd6dd805..db86865ca 100644 --- a/db/db_stats_logger.cc +++ b/db/db_stats_logger.cc @@ -68,11 +68,11 @@ void DBImpl::LogDBDeployStats() { Version* current = versions_->current(); for (int i = 0; i < current->NumberLevels(); i++) { file_total_num += current->NumLevelFiles(i); - file_total_size += versions_->NumLevelBytes(i); + file_total_size += current->NumLevelBytes(i); } - VersionSet::LevelSummaryStorage scratch; - const char* file_num_summary = versions_->LevelSummary(&scratch); + Version::LevelSummaryStorage scratch; + const char* file_num_summary = current->LevelSummary(&scratch); std::string file_num_per_level(file_num_summary); std::string data_size_per_level(file_num_summary); diff --git a/db/version_set.cc b/db/version_set.cc index b4c1b2233..eb20650ba 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -857,6 +857,67 @@ bool Version::HasOverlappingUserKey( return false; } +int64_t Version::NumLevelBytes(int level) const { + assert(level >= 0); + assert(level < NumberLevels()); + return TotalFileSize(files_[level]); +} + +const char* Version::LevelSummary(LevelSummaryStorage* scratch) const { + int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files["); + for (int i = 0; i < NumberLevels(); i++) { + int sz = sizeof(scratch->buffer) - len; + int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size())); + if (ret < 0 || ret >= sz) break; + len += ret; + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); + return scratch->buffer; +} + +const char* Version::LevelFileSummary(FileSummaryStorage* scratch, + int level) const { + int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); + for (const auto& f : files_[level]) { + int sz = sizeof(scratch->buffer) - len; + int ret = snprintf(scratch->buffer + len, sz, + "#%lu(seq=%lu,sz=%lu,%lu) ", + (unsigned long)f->number, + (unsigned long)f->smallest_seqno, + (unsigned long)f->file_size, + (unsigned long)f->being_compacted); + if (ret < 0 || ret >= sz) + break; + len += ret; + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); + return scratch->buffer; +} + +int64_t Version::MaxNextLevelOverlappingBytes() { + uint64_t result = 0; + std::vector overlaps; + for (int level = 1; level < NumberLevels() - 1; level++) { + for (const auto& f : files_[level]) { + GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps); + const uint64_t sum = TotalFileSize(overlaps); + if (sum > result) { + result = sum; + } + } + } + return result; +} + +void Version::AddLiveFiles(std::set* live) { + for (int level = 0; level < NumberLevels(); level++) { + const std::vector& files = files_[level]; + for (const auto& file : files) { + live->insert(file->number); + } + } +} + std::string Version::DebugString(bool hex) const { std::string r; for (int level = 0; level < num_levels_; level++) { @@ -1145,7 +1206,7 @@ VersionSet::VersionSet(const std::string& dbname, const Options* options, num_levels_(options_->num_levels), dummy_versions_(this), current_(nullptr), - need_slowdown_for_num_level0_files(false), + need_slowdown_for_num_level0_files_(false), compactions_in_progress_(options_->num_levels), current_version_number_(0), manifest_file_size_(0), @@ -1197,7 +1258,7 @@ void 
VersionSet::AppendVersion(Version* v) { current_->Unref(); } current_ = v; - need_slowdown_for_num_level0_files = + need_slowdown_for_num_level0_files_ = (options_->level0_slowdown_writes_trigger >= 0 && current_ != nullptr && v->NumLevelFiles(0) >= options_->level0_slowdown_writes_trigger); v->Ref(); @@ -1861,55 +1922,6 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { return log->AddRecord(record); } -const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const { - int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files["); - for (int i = 0; i < current_->NumberLevels(); i++) { - int sz = sizeof(scratch->buffer) - len; - int ret = snprintf(scratch->buffer + len, sz, "%d ", - int(current_->files_[i].size())); - if (ret < 0 || ret >= sz) - break; - len += ret; - } - snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); - return scratch->buffer; -} - -const char* VersionSet::LevelDataSizeSummary(LevelSummaryStorage* scratch) - const { - int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); - for (int i = 0; i < current_->NumberLevels(); i++) { - int sz = sizeof(scratch->buffer) - len; - int ret = snprintf(scratch->buffer + len, sz, "%lu ", - (unsigned long)NumLevelBytes(i)); - if (ret < 0 || ret >= sz) - break; - len += ret; - } - snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); - return scratch->buffer; -} - -const char* VersionSet::LevelFileSummary( - FileSummaryStorage* scratch, int level) const { - int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); - for (unsigned int i = 0; i < current_->files_[level].size(); i++) { - FileMetaData* f = current_->files_[level][i]; - int sz = sizeof(scratch->buffer) - len; - int ret = snprintf(scratch->buffer + len, sz, - "#%lu(seq=%lu,sz=%lu,%lu) ", - (unsigned long)f->number, - (unsigned long)f->smallest_seqno, - (unsigned long)f->file_size, - (unsigned long)f->being_compacted); - if (ret < 0 || ret >= sz) - break; - len += ret; - } - snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); - return scratch->buffer; -} - // Opens the mainfest file and reads all records // till it finds the record we are looking for. bool VersionSet::ManifestContains(const std::string& record) const { @@ -1997,40 +2009,6 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { } } -void VersionSet::AddLiveFilesCurrentVersion(std::set* live) { - Version* v = current_; - for (int level = 0; level < v->NumberLevels(); level++) { - const std::vector& files = v->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - live->insert(files[i]->number); - } - } -} - -int64_t VersionSet::NumLevelBytes(int level) const { - assert(level >= 0); - assert(level < current_->NumberLevels()); - assert(current_); - return TotalFileSize(current_->files_[level]); -} - -int64_t VersionSet::MaxNextLevelOverlappingBytes() { - uint64_t result = 0; - std::vector overlaps; - for (int level = 1; level < current_->NumberLevels() - 1; level++) { - for (size_t i = 0; i < current_->files_[level].size(); i++) { - const FileMetaData* f = current_->files_[level][i]; - current_->GetOverlappingInputs(level+1, &f->smallest, &f->largest, - &overlaps); - const uint64_t sum = TotalFileSize(overlaps); - if (sum > result) { - result = sum; - } - } - } - return result; -} - // Stores the minimal range that covers all entries in inputs in // *smallest, *largest. 
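
Both summary builders moved above rely on the C99 snprintf contract: the return value is the length the piece would have needed, so ret >= sz means the piece did not fit and the loop stops before overrunning the fixed scratch buffer. A standalone sketch of that idiom, with the buffer capacity and level counts as invented inputs:

    #include <cstdio>

    // Bounded append, as in Version::LevelSummary(): stop as soon as a
    // piece would be truncated, leaving a valid NUL-terminated string.
    void BuildSummary(char* buf, int cap, const int* counts, int n) {
      int len = snprintf(buf, cap, "files[");
      if (len < 0 || len >= cap) return;    // nothing more fits
      for (int i = 0; i < n; i++) {
        int sz = cap - len;
        int ret = snprintf(buf + len, sz, "%d ", counts[i]);
        if (ret < 0 || ret >= sz) break;    // truncated; keep what fits
        len += ret;
      }
      snprintf(buf + len, cap - len, "]");
    }

Callers supply the scratch storage, which keeps the helpers allocation-free; the Log() sites in this diff use exactly that shape:

    Version::LevelSummaryStorage tmp;
    Log(options_.info_log, "files per level: %s",
        versions_->current()->LevelSummary(&tmp));
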
// REQUIRES: inputs is not empty @@ -2456,10 +2434,10 @@ Compaction* VersionSet::PickCompactionUniversal(int level, double score) { Log(options_->info_log, "Universal: nothing to do\n"); return nullptr; } - VersionSet::FileSummaryStorage tmp; + Version::FileSummaryStorage tmp; Log(options_->info_log, "Universal: candidate files(%lu): %s\n", current_->files_[level].size(), - LevelFileSummary(&tmp, 0)); + current_->LevelFileSummary(&tmp, 0)); // Check for size amplification first. Compaction* c = PickCompactionUniversalSizeAmp(level, score); diff --git a/db/version_set.h b/db/version_set.h index 68c41b160..51f6d9b6c 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -140,13 +140,34 @@ class Version { // REQUIRES: lock is held int NumLevelFiles(int level) const { return files_[level].size(); } + // Return the combined file size of all files at the specified level. + int64_t NumLevelBytes(int level) const; + + // Return a human-readable short (single-line) summary of the number + // of files per level. Uses *scratch as backing store. + struct LevelSummaryStorage { + char buffer[100]; + }; + struct FileSummaryStorage { + char buffer[1000]; + }; + const char* LevelSummary(LevelSummaryStorage* scratch) const; + // Return a human-readable short (single-line) summary of files + // in a specified level. Uses *scratch as backing store. + const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const; + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64_t MaxNextLevelOverlappingBytes(); + + // Add all files listed in the current version to *live. + void AddLiveFiles(std::set* live); + // Return a human readable string that describes this version's contents. std::string DebugString(bool hex = false) const; // Returns the version nuber of this version - uint64_t GetVersionNumber() { - return version_number_; - } + uint64_t GetVersionNumber() const { return version_number_; } private: friend class Compaction; @@ -222,10 +243,8 @@ class Version { class VersionSet { public: - VersionSet(const std::string& dbname, - const Options* options, - const EnvOptions& storage_options, - TableCache* table_cache, + VersionSet(const std::string& dbname, const Options* options, + const EnvOptions& storage_options, TableCache* table_cache, const InternalKeyComparator*); ~VersionSet(); @@ -254,7 +273,7 @@ class VersionSet { // A Flag indicating whether write needs to slowdown because of there are // too many number of level0 files. bool NeedSlowdownForNumLevel0Files() const { - return need_slowdown_for_num_level0_files; + return need_slowdown_for_num_level0_files_; } // Return the current manifest file number @@ -272,9 +291,6 @@ class VersionSet { } } - // Return the combined file size of all files at the specified level. - int64_t NumLevelBytes(int level) const; - // Return the last sequence number. uint64_t LastSequence() const { return last_sequence_.load(std::memory_order_acquire); @@ -321,10 +337,6 @@ class VersionSet { const InternalKey* end, InternalKey** compaction_end); - // Return the maximum overlapping data (in bytes) at next level for any - // file at a level >= 1. - int64_t MaxNextLevelOverlappingBytes(); - // Create an iterator that reads over the compaction inputs for "*c". // The caller should delete the iterator when no longer needed. Iterator* MakeInputIterator(Compaction* c); @@ -368,35 +380,14 @@ class VersionSet { // Add all files listed in any live version to *live. 
void AddLiveFiles(std::vector* live_list); - // Add all files listed in the current version to *live. - void AddLiveFilesCurrentVersion(std::set* live); - // Return the approximate offset in the database of the data for // "key" as of version "v". uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); - // Return a human-readable short (single-line) summary of the number - // of files per level. Uses *scratch as backing store. - struct LevelSummaryStorage { - char buffer[100]; - }; - struct FileSummaryStorage { - char buffer[1000]; - }; - const char* LevelSummary(LevelSummaryStorage* scratch) const; - // printf contents (for debugging) Status DumpManifest(Options& options, std::string& manifestFileName, bool verbose, bool hex = false); - // Return a human-readable short (single-line) summary of the data size - // of files per level. Uses *scratch as backing store. - const char* LevelDataSizeSummary(LevelSummaryStorage* scratch) const; - - // Return a human-readable short (single-line) summary of files - // in a specified level. Uses *scratch as backing store. - const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const; - // Return the size of the current manifest file uint64_t ManifestFileSize() const { return manifest_file_size_; } @@ -501,7 +492,9 @@ class VersionSet { Version dummy_versions_; // Head of circular doubly-linked list of versions. Version* current_; // == dummy_versions_.prev_ - bool need_slowdown_for_num_level0_files; + // A flag indicating whether we should delay writes because + // we have too many level 0 files + bool need_slowdown_for_num_level0_files_; // Per-level key at which the next compaction at that level should start. // Either an empty string, or a valid InternalKey. From 615d1ea2f48ee7ca730cee8f06778e2f06dd0fbd Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 15 Jan 2014 16:22:34 -0800 Subject: [PATCH 18/27] Moving Compaction class to separate header file Summary: I'm sure we'll all agree that version_set.cc needs simplifying. This diff moves Compaction class to a separate file. The diff depends on D15171 and D15183 Test Plan: make check Reviewers: dhruba, haobo, kailiu, sdong Reviewed By: kailiu CC: leveldb Differential Revision: https://reviews.facebook.net/D15189 --- db/compaction.cc | 214 ++++++++++++++++++++++++++++++++++++++++++++++ db/compaction.h | 131 ++++++++++++++++++++++++++++ db/version_set.cc | 193 +---------------------------------------- db/version_set.h | 115 +------------------------ 4 files changed, 347 insertions(+), 306 deletions(-) create mode 100644 db/compaction.cc create mode 100644 db/compaction.h diff --git a/db/compaction.cc b/db/compaction.cc new file mode 100644 index 000000000..703e7aeae --- /dev/null +++ b/db/compaction.cc @@ -0,0 +1,214 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
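
Before the new file's body below, it is worth spelling out what IsTrivialMove(), defined further down, buys: with exactly one input file, nothing to merge with in the output level, bounded grandparent overlap, and a real level change, the file is relinked rather than rewritten. A sketch of that path, paraphrasing the DBImpl::BackgroundCompaction hunk earlier in this series (illustrative, not part of this patch):

    // A trivial move edits metadata only; no data blocks are rewritten.
    if (c->IsTrivialMove()) {
      FileMetaData* f = c->input(0, 0);
      c->edit()->DeleteFile(c->level(), f->number);
      c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
                         f->smallest, f->largest,
                         f->smallest_seqno, f->largest_seqno);
      // ...followed by versions_->LogAndApply(c->edit(), &mutex_)
    }
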
+ +#include "db/compaction.h" + +namespace rocksdb { + +static uint64_t TotalFileSize(const std::vector& files) { + uint64_t sum = 0; + for (size_t i = 0; i < files.size() && files[i]; i++) { + sum += files[i]->file_size; + } + return sum; +} + +Compaction::Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, + uint64_t max_grandparent_overlap_bytes, + bool seek_compaction, bool enable_compression) + : level_(level), + out_level_(out_level), + max_output_file_size_(target_file_size), + maxGrandParentOverlapBytes_(max_grandparent_overlap_bytes), + input_version_(input_version), + number_levels_(input_version_->NumberLevels()), + seek_compaction_(seek_compaction), + enable_compression_(enable_compression), + grandparent_index_(0), + seen_key_(false), + overlapped_bytes_(0), + base_index_(-1), + parent_index_(-1), + score_(0), + bottommost_level_(false), + is_full_compaction_(false), + level_ptrs_(std::vector(number_levels_)) { + + input_version_->Ref(); + edit_ = new VersionEdit(); + for (int i = 0; i < number_levels_; i++) { + level_ptrs_[i] = 0; + } +} + +Compaction::~Compaction() { + delete edit_; + if (input_version_ != nullptr) { + input_version_->Unref(); + } +} + +bool Compaction::IsTrivialMove() const { + // Avoid a move if there is lots of overlapping grandparent data. + // Otherwise, the move could create a parent file that will require + // a very expensive merge later on. + // If level_== out_level_, the purpose is to force compaction filter to be + // applied to that level, and thus cannot be a trivia move. + return (level_ != out_level_ && + num_input_files(0) == 1 && + num_input_files(1) == 0 && + TotalFileSize(grandparents_) <= maxGrandParentOverlapBytes_); +} + +void Compaction::AddInputDeletions(VersionEdit* edit) { + for (int which = 0; which < 2; which++) { + for (size_t i = 0; i < inputs_[which].size(); i++) { + edit->DeleteFile(level_ + which, inputs_[which][i]->number); + } + } +} + +bool Compaction::IsBaseLevelForKey(const Slice& user_key) { + if (input_version_->vset_->options_->compaction_style == + kCompactionStyleUniversal) { + return bottommost_level_; + } + // Maybe use binary search to find right entry instead of linear search? + const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); + for (int lvl = level_ + 2; lvl < number_levels_; lvl++) { + const std::vector& files = input_version_->files_[lvl]; + for (; level_ptrs_[lvl] < files.size(); ) { + FileMetaData* f = files[level_ptrs_[lvl]]; + if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { + // We've advanced far enough + if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { + // Key falls in this file's range, so definitely not base level + return false; + } + break; + } + level_ptrs_[lvl]++; + } + } + return true; +} + +bool Compaction::ShouldStopBefore(const Slice& internal_key) { + // Scan to find earliest grandparent file that contains key. 
+ const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; + while (grandparent_index_ < grandparents_.size() && + icmp->Compare(internal_key, + grandparents_[grandparent_index_]->largest.Encode()) > 0) { + if (seen_key_) { + overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; + } + assert(grandparent_index_ + 1 >= grandparents_.size() || + icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(), + grandparents_[grandparent_index_+1]->smallest.Encode()) + < 0); + grandparent_index_++; + } + seen_key_ = true; + + if (overlapped_bytes_ > maxGrandParentOverlapBytes_) { + // Too much overlap for current output; start new output + overlapped_bytes_ = 0; + return true; + } else { + return false; + } +} + +// Mark (or clear) each file that is being compacted +void Compaction::MarkFilesBeingCompacted(bool value) { + for (int i = 0; i < 2; i++) { + std::vector v = inputs_[i]; + for (unsigned int j = 0; j < inputs_[i].size(); j++) { + assert(value ? !inputs_[i][j]->being_compacted : + inputs_[i][j]->being_compacted); + inputs_[i][j]->being_compacted = value; + } + } +} + +// Is this compaction producing files at the bottommost level? +void Compaction::SetupBottomMostLevel(bool isManual) { + if (input_version_->vset_->options_->compaction_style == + kCompactionStyleUniversal) { + // If universal compaction style is used and manual + // compaction is occuring, then we are guaranteed that + // all files will be picked in a single compaction + // run. We can safely set bottommost_level_ = true. + // If it is not manual compaction, then bottommost_level_ + // is already set when the Compaction was created. + if (isManual) { + bottommost_level_ = true; + } + return; + } + bottommost_level_ = true; + int num_levels = input_version_->vset_->NumberLevels(); + for (int i = output_level() + 1; i < num_levels; i++) { + if (input_version_->NumLevelFiles(i) > 0) { + bottommost_level_ = false; + break; + } + } +} + +void Compaction::ReleaseInputs() { + if (input_version_ != nullptr) { + input_version_->Unref(); + input_version_ = nullptr; + } +} + +void Compaction::ResetNextCompactionIndex() { + input_version_->ResetNextCompactionIndex(level_); +} + +static void InputSummary(std::vector& files, char* output, + int len) { + int write = 0; + for (unsigned int i = 0; i < files.size(); i++) { + int sz = len - write; + int ret = snprintf(output + write, sz, "%lu(%lu) ", + (unsigned long)files.at(i)->number, + (unsigned long)files.at(i)->file_size); + if (ret < 0 || ret >= sz) + break; + write += ret; + } +} + +void Compaction::Summary(char* output, int len) { + int write = snprintf(output, len, + "Base version %lu Base level %d, seek compaction:%d, inputs:", + (unsigned long)input_version_->GetVersionNumber(), + level_, + seek_compaction_); + if (write < 0 || write > len) { + return; + } + + char level_low_summary[100]; + InputSummary(inputs_[0], level_low_summary, sizeof(level_low_summary)); + char level_up_summary[100]; + if (inputs_[1].size()) { + InputSummary(inputs_[1], level_up_summary, sizeof(level_up_summary)); + } else { + level_up_summary[0] = '\0'; + } + + snprintf(output + write, len - write, "[%s],[%s]", + level_low_summary, level_up_summary); +} + +} // namespace rocksdb diff --git a/db/compaction.h b/db/compaction.h new file mode 100644 index 000000000..4cc0197da --- /dev/null +++ b/db/compaction.h @@ -0,0 +1,131 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "db/version_set.h" + +namespace rocksdb { + +class Version; + +// A Compaction encapsulates information about a compaction. +class Compaction { + public: + ~Compaction(); + + // Return the level that is being compacted. Inputs from "level" + // will be merged. + int level() const { return level_; } + + // Outputs will go to this level + int output_level() const { return out_level_; } + + // Return the object that holds the edits to the descriptor done + // by this compaction. + VersionEdit* edit() { return edit_; } + + // "which" must be either 0 or 1 + int num_input_files(int which) const { return inputs_[which].size(); } + + // Return the ith input file at "level()+which" ("which" must be 0 or 1). + FileMetaData* input(int which, int i) const { return inputs_[which][i]; } + + // Maximum size of files to build during this compaction. + uint64_t MaxOutputFileSize() const { return max_output_file_size_; } + + // Whether compression will be enabled for compaction outputs + bool enable_compression() const { return enable_compression_; } + + // Is this a trivial compaction that can be implemented by just + // moving a single input file to the next level (no merging or splitting) + bool IsTrivialMove() const; + + // Add all inputs to this compaction as delete operations to *edit. + void AddInputDeletions(VersionEdit* edit); + + // Returns true if the information we have available guarantees that + // the compaction is producing data in "level+1" for which no data exists + // in levels greater than "level+1". + bool IsBaseLevelForKey(const Slice& user_key); + + // Returns true iff we should stop building the current output + // before processing "internal_key". + bool ShouldStopBefore(const Slice& internal_key); + + // Release the input version for the compaction, once the compaction + // is successful. + void ReleaseInputs(); + + void Summary(char* output, int len); + + // Return the score that was used to pick this compaction run. + double score() const { return score_; } + + // Is this compaction creating a file in the bottom most level? + bool BottomMostLevel() { return bottommost_level_; } + + // Does this compaction include all sst files? 
+ bool IsFullCompaction() { return is_full_compaction_; } + + private: + friend class Version; + friend class VersionSet; + + Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes, + bool seek_compaction = false, bool enable_compression = true); + + int level_; + int out_level_; // levels to which output files are stored + uint64_t max_output_file_size_; + uint64_t maxGrandParentOverlapBytes_; + Version* input_version_; + VersionEdit* edit_; + int number_levels_; + + bool seek_compaction_; + bool enable_compression_; + + // Each compaction reads inputs from "level_" and "level_+1" + std::vector inputs_[2]; // The two sets of inputs + + // State used to check for number of of overlapping grandparent files + // (parent == level_ + 1, grandparent == level_ + 2) + std::vector grandparents_; + size_t grandparent_index_; // Index in grandparent_starts_ + bool seen_key_; // Some output key has been seen + uint64_t overlapped_bytes_; // Bytes of overlap between current output + // and grandparent files + int base_index_; // index of the file in files_[level_] + int parent_index_; // index of some file with same range in files_[level_+1] + double score_; // score that was used to pick this compaction. + + // Is this compaction creating a file in the bottom most level? + bool bottommost_level_; + // Does this compaction include all sst files? + bool is_full_compaction_; + + // level_ptrs_ holds indices into input_version_->levels_: our state + // is that we are positioned at one of the file ranges for each + // higher level than the ones involved in this compaction (i.e. for + // all L >= level_ + 2). + std::vector level_ptrs_; + + // mark (or clear) all files that are being compacted + void MarkFilesBeingCompacted(bool); + + // Initialize whether compaction producing files at the bottommost level + void SetupBottomMostLevel(bool isManual); + + // In case of compaction error, reset the nextIndex that is used + // to pick up the next file to be compacted from files_by_size_ + void ResetNextCompactionIndex(); +}; + +} // namespace rocksdb diff --git a/db/version_set.cc b/db/version_set.cc index eb20650ba..05e7c7053 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -18,6 +18,7 @@ #include "db/memtable.h" #include "db/merge_context.h" #include "db/table_cache.h" +#include "db/compaction.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/table.h" @@ -2953,196 +2954,4 @@ Compaction* VersionSet::CompactRange(int input_level, return c; } -Compaction::Compaction(Version* input_version, int level, int out_level, - uint64_t target_file_size, - uint64_t max_grandparent_overlap_bytes, - bool seek_compaction, bool enable_compression) - : level_(level), - out_level_(out_level), - max_output_file_size_(target_file_size), - maxGrandParentOverlapBytes_(max_grandparent_overlap_bytes), - input_version_(input_version), - number_levels_(input_version_->NumberLevels()), - seek_compaction_(seek_compaction), - enable_compression_(enable_compression), - grandparent_index_(0), - seen_key_(false), - overlapped_bytes_(0), - base_index_(-1), - parent_index_(-1), - score_(0), - bottommost_level_(false), - is_full_compaction_(false), - level_ptrs_(std::vector(number_levels_)) { - - input_version_->Ref(); - edit_ = new VersionEdit(); - for (int i = 0; i < number_levels_; i++) { - level_ptrs_[i] = 0; - } -} - -Compaction::~Compaction() { - delete edit_; - if (input_version_ != nullptr) { - input_version_->Unref(); - } -} 
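
The constructor/destructor pair above encodes a pinning protocol: a compaction takes a reference on its input Version so the files it reads cannot disappear mid-compaction; ReleaseInputs() drops the reference early on success, and the destructor covers the error paths. The same contract expressed as a hypothetical RAII guard (not in the patch):

    // Hypothetical RAII equivalent of Compaction's Version pinning.
    class VersionPin {
     public:
      explicit VersionPin(Version* v) : v_(v) { v_->Ref(); }
      ~VersionPin() {
        if (v_ != nullptr) v_->Unref();   // error path: still pinned
      }
      void Release() {                    // success path: unpin early
        v_->Unref();
        v_ = nullptr;
      }
     private:
      Version* v_;
    };
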
- -bool Compaction::IsTrivialMove() const { - // Avoid a move if there is lots of overlapping grandparent data. - // Otherwise, the move could create a parent file that will require - // a very expensive merge later on. - // If level_== out_level_, the purpose is to force compaction filter to be - // applied to that level, and thus cannot be a trivia move. - return (level_ != out_level_ && - num_input_files(0) == 1 && - num_input_files(1) == 0 && - TotalFileSize(grandparents_) <= maxGrandParentOverlapBytes_); -} - -void Compaction::AddInputDeletions(VersionEdit* edit) { - for (int which = 0; which < 2; which++) { - for (size_t i = 0; i < inputs_[which].size(); i++) { - edit->DeleteFile(level_ + which, inputs_[which][i]->number); - } - } -} - -bool Compaction::IsBaseLevelForKey(const Slice& user_key) { - if (input_version_->vset_->options_->compaction_style == - kCompactionStyleUniversal) { - return bottommost_level_; - } - // Maybe use binary search to find right entry instead of linear search? - const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); - for (int lvl = level_ + 2; lvl < number_levels_; lvl++) { - const std::vector& files = input_version_->files_[lvl]; - for (; level_ptrs_[lvl] < files.size(); ) { - FileMetaData* f = files[level_ptrs_[lvl]]; - if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { - // We've advanced far enough - if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { - // Key falls in this file's range, so definitely not base level - return false; - } - break; - } - level_ptrs_[lvl]++; - } - } - return true; -} - -bool Compaction::ShouldStopBefore(const Slice& internal_key) { - // Scan to find earliest grandparent file that contains key. - const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; - while (grandparent_index_ < grandparents_.size() && - icmp->Compare(internal_key, - grandparents_[grandparent_index_]->largest.Encode()) > 0) { - if (seen_key_) { - overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; - } - assert(grandparent_index_ + 1 >= grandparents_.size() || - icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(), - grandparents_[grandparent_index_+1]->smallest.Encode()) - < 0); - grandparent_index_++; - } - seen_key_ = true; - - if (overlapped_bytes_ > maxGrandParentOverlapBytes_) { - // Too much overlap for current output; start new output - overlapped_bytes_ = 0; - return true; - } else { - return false; - } -} - -// Mark (or clear) each file that is being compacted -void Compaction::MarkFilesBeingCompacted(bool value) { - for (int i = 0; i < 2; i++) { - std::vector v = inputs_[i]; - for (unsigned int j = 0; j < inputs_[i].size(); j++) { - assert(value ? !inputs_[i][j]->being_compacted : - inputs_[i][j]->being_compacted); - inputs_[i][j]->being_compacted = value; - } - } -} - -// Is this compaction producing files at the bottommost level? -void Compaction::SetupBottomMostLevel(bool isManual) { - if (input_version_->vset_->options_->compaction_style == - kCompactionStyleUniversal) { - // If universal compaction style is used and manual - // compaction is occuring, then we are guaranteed that - // all files will be picked in a single compaction - // run. We can safely set bottommost_level_ = true. - // If it is not manual compaction, then bottommost_level_ - // is already set when the Compaction was created. 
- if (isManual) { - bottommost_level_ = true; - } - return; - } - bottommost_level_ = true; - int num_levels = input_version_->vset_->NumberLevels(); - for (int i = output_level() + 1; i < num_levels; i++) { - if (input_version_->NumLevelFiles(i) > 0) { - bottommost_level_ = false; - break; - } - } -} - -void Compaction::ReleaseInputs() { - if (input_version_ != nullptr) { - input_version_->Unref(); - input_version_ = nullptr; - } -} - -void Compaction::ResetNextCompactionIndex() { - input_version_->ResetNextCompactionIndex(level_); -} - -static void InputSummary(std::vector& files, char* output, - int len) { - int write = 0; - for (unsigned int i = 0; i < files.size(); i++) { - int sz = len - write; - int ret = snprintf(output + write, sz, "%lu(%lu) ", - (unsigned long)files.at(i)->number, - (unsigned long)files.at(i)->file_size); - if (ret < 0 || ret >= sz) - break; - write += ret; - } -} - -void Compaction::Summary(char* output, int len) { - int write = snprintf(output, len, - "Base version %lu Base level %d, seek compaction:%d, inputs:", - (unsigned long)input_version_->GetVersionNumber(), - level_, - seek_compaction_); - if (write < 0 || write > len) { - return; - } - - char level_low_summary[100]; - InputSummary(inputs_[0], level_low_summary, sizeof(level_low_summary)); - char level_up_summary[100]; - if (inputs_[1].size()) { - InputSummary(inputs_[1], level_up_summary, sizeof(level_up_summary)); - } else { - level_up_summary[0] = '\0'; - } - - snprintf(output + write, len - write, "[%s],[%s]", - level_low_summary, level_up_summary); -} - } // namespace rocksdb diff --git a/db/version_set.h b/db/version_set.h index 51f6d9b6c..319067d1a 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -27,6 +27,7 @@ #include "db/version_edit.h" #include "port/port.h" #include "db/table_cache.h" +#include "db/compaction.h" namespace rocksdb { @@ -546,118 +547,4 @@ class VersionSet { VersionEdit* edit, port::Mutex* mu); }; -// A Compaction encapsulates information about a compaction. -class Compaction { - public: - ~Compaction(); - - // Return the level that is being compacted. Inputs from "level" - // will be merged. - int level() const { return level_; } - - // Outputs will go to this level - int output_level() const { return out_level_; } - - // Return the object that holds the edits to the descriptor done - // by this compaction. - VersionEdit* edit() { return edit_; } - - // "which" must be either 0 or 1 - int num_input_files(int which) const { return inputs_[which].size(); } - - // Return the ith input file at "level()+which" ("which" must be 0 or 1). - FileMetaData* input(int which, int i) const { return inputs_[which][i]; } - - // Maximum size of files to build during this compaction. - uint64_t MaxOutputFileSize() const { return max_output_file_size_; } - - // Whether compression will be enabled for compaction outputs - bool enable_compression() const { return enable_compression_; } - - // Is this a trivial compaction that can be implemented by just - // moving a single input file to the next level (no merging or splitting) - bool IsTrivialMove() const; - - // Add all inputs to this compaction as delete operations to *edit. - void AddInputDeletions(VersionEdit* edit); - - // Returns true if the information we have available guarantees that - // the compaction is producing data in "level+1" for which no data exists - // in levels greater than "level+1". 
- bool IsBaseLevelForKey(const Slice& user_key); - - // Returns true iff we should stop building the current output - // before processing "internal_key". - bool ShouldStopBefore(const Slice& internal_key); - - // Release the input version for the compaction, once the compaction - // is successful. - void ReleaseInputs(); - - void Summary(char* output, int len); - - // Return the score that was used to pick this compaction run. - double score() const { return score_; } - - // Is this compaction creating a file in the bottom most level? - bool BottomMostLevel() { return bottommost_level_; } - - // Does this compaction include all sst files? - bool IsFullCompaction() { return is_full_compaction_; } - - private: - friend class Version; - friend class VersionSet; - - Compaction(Version* input_version, int level, int out_level, - uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes, - bool seek_compaction = false, bool enable_compression = true); - - int level_; - int out_level_; // levels to which output files are stored - uint64_t max_output_file_size_; - uint64_t maxGrandParentOverlapBytes_; - Version* input_version_; - VersionEdit* edit_; - int number_levels_; - - bool seek_compaction_; - bool enable_compression_; - - // Each compaction reads inputs from "level_" and "level_+1" - std::vector inputs_[2]; // The two sets of inputs - - // State used to check for number of of overlapping grandparent files - // (parent == level_ + 1, grandparent == level_ + 2) - std::vector grandparents_; - size_t grandparent_index_; // Index in grandparent_starts_ - bool seen_key_; // Some output key has been seen - uint64_t overlapped_bytes_; // Bytes of overlap between current output - // and grandparent files - int base_index_; // index of the file in files_[level_] - int parent_index_; // index of some file with same range in files_[level_+1] - double score_; // score that was used to pick this compaction. - - // Is this compaction creating a file in the bottom most level? - bool bottommost_level_; - // Does this compaction include all sst files? - bool is_full_compaction_; - - // level_ptrs_ holds indices into input_version_->levels_: our state - // is that we are positioned at one of the file ranges for each - // higher level than the ones involved in this compaction (i.e. for - // all L >= level_ + 2). 
- std::vector level_ptrs_; - - // mark (or clear) all files that are being compacted - void MarkFilesBeingCompacted(bool); - - // Initialize whether compaction producing files at the bottommost level - void SetupBottomMostLevel(bool isManual); - - // In case of compaction error, reset the nextIndex that is used - // to pick up the next file to be compacted from files_by_size_ - void ResetNextCompactionIndex(); -}; - } // namespace rocksdb From 787f11bb3bbd1539de1cfece609af1131e4eae9a Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 15 Jan 2014 16:23:36 -0800 Subject: [PATCH 19/27] Move more functions from VersionSet to Version Summary: This moves functions: * VersionSet::Finalize() -> Version::UpdateCompactionStats() * VersionSet::UpdateFilesBySize() -> Version::UpdateFilesBySize() The diff depends on D15189, D15183 and D15171 Test Plan: make check Reviewers: kailiu, sdong, haobo, dhruba Reviewed By: sdong CC: leveldb Differential Revision: https://reviews.facebook.net/D15201 --- db/version_set.cc | 315 +++++++++++++++++++++++----------------------- db/version_set.h | 27 ++-- 2 files changed, 173 insertions(+), 169 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index 05e7c7053..64ebb1427 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -590,6 +590,159 @@ bool Version::UpdateStats(const GetStats& stats) { return false; } +void Version::Finalize(std::vector& size_being_compacted) { + // Pre-sort level0 for Get() + if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + std::sort(files_[0].begin(), files_[0].end(), NewestFirstBySeqNo); + } else { + std::sort(files_[0].begin(), files_[0].end(), NewestFirst); + } + + double max_score = 0; + int max_score_level = 0; + + int num_levels_to_check = + (vset_->options_->compaction_style != kCompactionStyleUniversal) + ? NumberLevels() - 1 + : 1; + + for (int level = 0; level < num_levels_to_check; level++) { + double score; + if (level == 0) { + // We treat level-0 specially by bounding the number of files + // instead of number of bytes for two reasons: + // + // (1) With larger write-buffer sizes, it is nice not to do too + // many level-0 compactions. + // + // (2) The files in level-0 are merged on every read and + // therefore we wish to avoid too many files when the individual + // file size is small (perhaps because of a small write-buffer + // setting, or very high compression ratios, or lots of + // overwrites/deletions). + int numfiles = 0; + for (unsigned int i = 0; i < files_[level].size(); i++) { + if (!files_[level][i]->being_compacted) { + numfiles++; + } + } + + // If we are slowing down writes, then we better compact that first + if (numfiles >= vset_->options_->level0_stop_writes_trigger) { + score = 1000000; + // Log(options_->info_log, "XXX score l0 = 1000000000 max"); + } else if (numfiles >= vset_->options_->level0_slowdown_writes_trigger) { + score = 10000; + // Log(options_->info_log, "XXX score l0 = 1000000 medium"); + } else { + score = static_cast(numfiles) / + vset_->options_->level0_file_num_compaction_trigger; + if (score >= 1) { + // Log(options_->info_log, "XXX score l0 = %d least", (int)score); + } + } + } else { + // Compute the ratio of current size to size limit. 
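
The level-0 branch above scores by file count, and its magic numbers are priorities rather than ratios: 1000000 at the stop trigger and 10000 at the slowdown trigger guarantee that an overloaded L0 outranks any size-ratio score a higher level can produce. With invented option values:

    // Invented trigger values, exercising the L0 tiers of Finalize():
    int numfiles = 9;                       // L0 files not being compacted
    const int stop = 12, slowdown = 8, trigger = 4;
    double score;
    if (numfiles >= stop)          score = 1000000;  // compact immediately
    else if (numfiles >= slowdown) score = 10000;    // writes already slowed
    else score = static_cast<double>(numfiles) / trigger;
    // here 9 >= 8, so score == 10000
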
+ const uint64_t level_bytes = + TotalFileSize(files_[level]) - size_being_compacted[level]; + score = static_cast(level_bytes) / vset_->MaxBytesForLevel(level); + if (score > 1) { + // Log(options_->info_log, "XXX score l%d = %d ", level, (int)score); + } + if (max_score < score) { + max_score = score; + max_score_level = level; + } + } + compaction_level_[level] = level; + compaction_score_[level] = score; + } + + // update the max compaction score in levels 1 to n-1 + max_compaction_score_ = max_score; + max_compaction_score_level_ = max_score_level; + + // sort all the levels based on their score. Higher scores get listed + // first. Use bubble sort because the number of entries are small. + for (int i = 0; i < NumberLevels() - 2; i++) { + for (int j = i + 1; j < NumberLevels() - 1; j++) { + if (compaction_score_[i] < compaction_score_[j]) { + double score = compaction_score_[i]; + int level = compaction_level_[i]; + compaction_score_[i] = compaction_score_[j]; + compaction_level_[i] = compaction_level_[j]; + compaction_score_[j] = score; + compaction_level_[j] = level; + } + } + } +} + +namespace { + +// Compator that is used to sort files based on their size +// In normal mode: descending size +bool CompareSizeDescending(const Version::Fsize& first, + const Version::Fsize& second) { + return (first.file->file_size > second.file->file_size); +} +// A static compator used to sort files based on their seqno +// In universal style : descending seqno +bool CompareSeqnoDescending(const Version::Fsize& first, + const Version::Fsize& second) { + if (first.file->smallest_seqno > second.file->smallest_seqno) { + assert(first.file->largest_seqno > second.file->largest_seqno); + return true; + } + assert(first.file->largest_seqno <= second.file->largest_seqno); + return false; +} + +} // anonymous namespace + +void Version::UpdateFilesBySize() { + // No need to sort the highest level because it is never compacted. + int max_level = + (vset_->options_->compaction_style == kCompactionStyleUniversal) + ? NumberLevels() + : NumberLevels() - 1; + + for (int level = 0; level < max_level; level++) { + const std::vector& files = files_[level]; + std::vector& files_by_size = files_by_size_[level]; + assert(files_by_size.size() == 0); + + // populate a temp vector for sorting based on size + std::vector temp(files.size()); + for (unsigned int i = 0; i < files.size(); i++) { + temp[i].index = i; + temp[i].file = files[i]; + } + + // sort the top number_of_files_to_sort_ based on file size + if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + int num = temp.size(); + std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), + CompareSeqnoDescending); + } else { + int num = Version::number_of_files_to_sort_; + if (num > (int)temp.size()) { + num = temp.size(); + } + std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), + CompareSizeDescending); + } + assert(temp.size() == files.size()); + + // initialize files_by_size_ + for (unsigned int i = 0; i < temp.size(); i++) { + files_by_size.push_back(temp[i].index); + } + next_file_to_compact_by_size_[level] = 0; + assert(files_[level].size() == files_by_size_[level].size()); + } +} + void Version::Ref() { ++refs_; } @@ -1344,8 +1497,8 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, // The calls to Finalize and UpdateFilesBySize are cpu-heavy // and is best called outside the mutex. 
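
For levels above zero the score is plain occupancy, discounting bytes that running compactions are already draining. Numerically (sizes invented): a level holding 250 MB with 50 MB under compaction, against a 100 MB MaxBytesForLevel() limit, scores (250 - 50) / 100 = 2.0, twice over target:

    // Invented sizes, exercising the ratio branch of Finalize():
    const double level_bytes = (250.0 - 50.0) * 1048576;  // MB to bytes
    const double max_bytes = 100.0 * 1048576;             // MaxBytesForLevel()
    const double score = level_bytes / max_bytes;         // == 2.0
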
- Finalize(v, size_being_compacted); - UpdateFilesBySize(v); + v->Finalize(size_being_compacted); + v->UpdateFilesBySize(); // Write new record to MANIFEST log if (s.ok()) { @@ -1580,7 +1733,7 @@ Status VersionSet::Recover() { // Install recovered version std::vector size_being_compacted(v->NumberLevels() - 1); SizeBeingCompacted(size_being_compacted); - Finalize(v, size_being_compacted); + v->Finalize(size_being_compacted); manifest_file_size_ = manifest_file_size; AppendVersion(v); @@ -1712,7 +1865,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, // Install recovered version std::vector size_being_compacted(v->NumberLevels() - 1); SizeBeingCompacted(size_being_compacted); - Finalize(v, size_being_compacted); + v->Finalize(size_being_compacted); AppendVersion(v); manifest_file_number_ = next_file; @@ -1740,158 +1893,6 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) { } } -void VersionSet::Finalize(Version* v, - std::vector& size_being_compacted) { - // Pre-sort level0 for Get() - if (options_->compaction_style == kCompactionStyleUniversal) { - std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirstBySeqNo); - } else { - std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirst); - } - - double max_score = 0; - int max_score_level = 0; - - int num_levels_to_check = - (options_->compaction_style != kCompactionStyleUniversal) ? - v->NumberLevels() - 1 : 1; - - for (int level = 0; level < num_levels_to_check; level++) { - - double score; - if (level == 0) { - // We treat level-0 specially by bounding the number of files - // instead of number of bytes for two reasons: - // - // (1) With larger write-buffer sizes, it is nice not to do too - // many level-0 compactions. - // - // (2) The files in level-0 are merged on every read and - // therefore we wish to avoid too many files when the individual - // file size is small (perhaps because of a small write-buffer - // setting, or very high compression ratios, or lots of - // overwrites/deletions). - int numfiles = 0; - for (unsigned int i = 0; i < v->files_[level].size(); i++) { - if (!v->files_[level][i]->being_compacted) { - numfiles++; - } - } - - // If we are slowing down writes, then we better compact that first - if (numfiles >= options_->level0_stop_writes_trigger) { - score = 1000000; - // Log(options_->info_log, "XXX score l0 = 1000000000 max"); - } else if (numfiles >= options_->level0_slowdown_writes_trigger) { - score = 10000; - // Log(options_->info_log, "XXX score l0 = 1000000 medium"); - } else { - score = numfiles / - static_cast(options_->level0_file_num_compaction_trigger); - if (score >= 1) { - // Log(options_->info_log, "XXX score l0 = %d least", (int)score); - } - } - } else { - // Compute the ratio of current size to size limit. - const uint64_t level_bytes = TotalFileSize(v->files_[level]) - - size_being_compacted[level]; - score = static_cast(level_bytes) / MaxBytesForLevel(level); - if (score > 1) { - // Log(options_->info_log, "XXX score l%d = %d ", level, (int)score); - } - if (max_score < score) { - max_score = score; - max_score_level = level; - } - } - v->compaction_level_[level] = level; - v->compaction_score_[level] = score; - } - - // update the max compaction score in levels 1 to n-1 - v->max_compaction_score_ = max_score; - v->max_compaction_score_level_ = max_score_level; - - // sort all the levels based on their score. Higher scores get listed - // first. Use bubble sort because the number of entries are small. 
- for (int i = 0; i < v->NumberLevels() - 2; i++) { - for (int j = i + 1; j < v->NumberLevels() - 1; j++) { - if (v->compaction_score_[i] < v->compaction_score_[j]) { - double score = v->compaction_score_[i]; - int level = v->compaction_level_[i]; - v->compaction_score_[i] = v->compaction_score_[j]; - v->compaction_level_[i] = v->compaction_level_[j]; - v->compaction_score_[j] = score; - v->compaction_level_[j] = level; - } - } - } -} - -// A static compator used to sort files based on their size -// In normal mode: descending size -static bool compareSizeDescending(const VersionSet::Fsize& first, - const VersionSet::Fsize& second) { - return (first.file->file_size > second.file->file_size); -} -// A static compator used to sort files based on their seqno -// In universal style : descending seqno -static bool compareSeqnoDescending(const VersionSet::Fsize& first, - const VersionSet::Fsize& second) { - if (first.file->smallest_seqno > second.file->smallest_seqno) { - assert(first.file->largest_seqno > second.file->largest_seqno); - return true; - } - assert(first.file->largest_seqno <= second.file->largest_seqno); - return false; -} - -// sort all files in level1 to level(n-1) based on file size -void VersionSet::UpdateFilesBySize(Version* v) { - - // No need to sort the highest level because it is never compacted. - int max_level = (options_->compaction_style == kCompactionStyleUniversal) - ? v->NumberLevels() - : v->NumberLevels() - 1; - - for (int level = 0; level < max_level; level++) { - - const std::vector& files = v->files_[level]; - std::vector& files_by_size = v->files_by_size_[level]; - assert(files_by_size.size() == 0); - - // populate a temp vector for sorting based on size - std::vector temp(files.size()); - for (unsigned int i = 0; i < files.size(); i++) { - temp[i].index = i; - temp[i].file = files[i]; - } - - // sort the top number_of_files_to_sort_ based on file size - if (options_->compaction_style == kCompactionStyleUniversal) { - int num = temp.size(); - std::partial_sort(temp.begin(), temp.begin() + num, - temp.end(), compareSeqnoDescending); - } else { - int num = Version::number_of_files_to_sort_; - if (num > (int)temp.size()) { - num = temp.size(); - } - std::partial_sort(temp.begin(), temp.begin() + num, - temp.end(), compareSizeDescending); - } - assert(temp.size() == files.size()); - - // initialize files_by_size_ - for (unsigned int i = 0; i < temp.size(); i++) { - files_by_size.push_back(temp[i].index); - } - v->next_file_to_compact_by_size_[level] = 0; - assert(v->files_[level].size() == v->files_by_size_[level].size()); - } -} - Status VersionSet::WriteSnapshot(log::Writer* log) { // TODO: Break up into multiple records to reduce memory usage on recovery? @@ -2586,7 +2587,7 @@ Compaction* VersionSet::PickCompaction() { // and also in LogAndApply(), otherwise the values could be stale. std::vector size_being_compacted(NumberLevels()-1); current_->vset_->SizeBeingCompacted(size_being_compacted); - Finalize(current_, size_being_compacted); + current_->Finalize(size_being_compacted); // In universal style of compaction, compact L0 files back into L0. 
   if (options_->compaction_style == kCompactionStyleUniversal) {
diff --git a/db/version_set.h b/db/version_set.h
index 319067d1a..8651a6eb3 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -87,6 +87,11 @@ class Version {
   // REQUIRES: lock is held
   bool UpdateStats(const GetStats& stats);
 
+  // Updates internal structures that keep track of compaction scores
+  // We use compaction scores to figure out which compaction to do next
+  // Also pre-sorts level0 files for Get()
+  void Finalize(std::vector<uint64_t>& size_being_compacted);
+
   // Reference count management (so Versions do not disappear out from
   // under live iterators)
   void Ref();
@@ -170,6 +175,12 @@ class Version {
   // Returns the version nuber of this version
   uint64_t GetVersionNumber() const { return version_number_; }
 
+  // used to sort files by size
+  struct Fsize {
+    int index;
+    FileMetaData* file;
+  };
+
  private:
   friend class Compaction;
   friend class VersionSet;
@@ -182,6 +193,10 @@ class Version {
   bool PrefixMayMatch(const ReadOptions& options, const EnvOptions& soptions,
                       const Slice& internal_prefix, Iterator* level_iter) const;
 
+  // Sort all files for this version based on their file size and
+  // record results in files_by_size_. The largest files are listed first.
+  void UpdateFilesBySize();
+
   VersionSet* vset_;            // VersionSet to which this Version belongs
   Version* next_;               // Next version in linked list
   Version* prev_;               // Previous version in linked list
@@ -417,16 +432,6 @@ class VersionSet {
   // pick the same files to compact.
   bool VerifyCompactionFileConsistency(Compaction* c);
 
-  // used to sort files by size
-  typedef struct fsize {
-    int index;
-    FileMetaData* file;
-  } Fsize;
-
-  // Sort all files for this version based on their file size and
-  // record results in files_by_size_. The largest files are listed first.
-  void UpdateFilesBySize(Version *v);
-
   // Get the max file size in a given level.
   uint64_t MaxFileSizeForLevel(int level);
 
@@ -449,8 +454,6 @@ class VersionSet {
 
   void Init(int num_levels);
 
-  void Finalize(Version* v, std::vector<uint64_t>&);
-
   void GetRange(const std::vector<FileMetaData*>& inputs,
                 InternalKey* smallest,
                 InternalKey* largest);

From eae1804f29585cc643dee798a52d71569d1d90de Mon Sep 17 00:00:00 2001
From: kailiu
Date: Wed, 15 Jan 2014 18:17:58 -0800
Subject: [PATCH 20/27] Remove the unnecessary use of shared_ptr

Summary:
shared_ptr is slower than unique_ptr (which comes with literally no
performance cost compared with raw pointers). In memtable and memtable
rep we use shared_ptr where we actually should use unique_ptr.
According to igor's previous work, we are likely to see quite some
performance gain from this diff.
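
Concretely, the cost the summary alludes to: every shared_ptr copy updates an atomically reference-counted control block, and constructing one allocates that block, while unique_ptr is a plain pointer with scoped delete. A sketch of the ownership shape the diffs below move to (given a factory, key comparator, and arena; signatures follow the patch):

    // Factories and reps now hand back owning raw pointers; the single
    // owner wraps them immediately, paying no reference-counting cost.
    MemTableRep* rep = factory->CreateMemTableRep(key_cmp, arena);
    std::unique_ptr<MemTableRep> table(rep);
    std::unique_ptr<MemTableRep::Iterator> iter(table->GetIterator());
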
Test Plan: make check Reviewers: dhruba, igor, sdong, haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D15213 --- db/memtable.cc | 20 ++++++------- db/memtable.h | 2 +- db/version_set.cc | 2 +- include/rocksdb/memtablerep.h | 24 +++++++--------- util/hash_skiplist_rep.cc | 54 ++++++++++++++--------------------- util/hash_skiplist_rep.h | 4 +-- util/skiplistrep.cc | 10 +++---- util/vectorrep.cc | 14 ++++----- 8 files changed, 58 insertions(+), 72 deletions(-) diff --git a/db/memtable.cc b/db/memtable.cc index baff4fb34..7eb4eb165 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -85,11 +85,11 @@ class MemTableIterator: public Iterator { MemTableIterator(MemTableRep* table, const ReadOptions& options) : iter_() { if (options.prefix) { - iter_ = table->GetPrefixIterator(*options.prefix); + iter_.reset(table->GetPrefixIterator(*options.prefix)); } else if (options.prefix_seek) { - iter_ = table->GetDynamicPrefixIterator(); + iter_.reset(table->GetDynamicPrefixIterator()); } else { - iter_ = table->GetIterator(); + iter_.reset(table->GetIterator()); } } @@ -110,7 +110,7 @@ class MemTableIterator: public Iterator { virtual Status status() const { return Status::OK(); } private: - std::shared_ptr iter_; + std::unique_ptr iter_; std::string tmp_; // For passing to EncodeKey // No copying allowed @@ -161,8 +161,8 @@ void MemTable::Add(SequenceNumber s, ValueType type, bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, MergeContext& merge_context, const Options& options) { Slice memkey = key.memtable_key(); - std::shared_ptr iter( - table_->GetIterator(key.user_key())); + std::unique_ptr iter( + table_->GetIterator(key.user_key())); iter->Seek(memkey.data()); bool merge_in_progress = s->IsMergeInProgress(); @@ -267,8 +267,8 @@ bool MemTable::Update(SequenceNumber seq, ValueType type, LookupKey lkey(key, seq); Slice memkey = lkey.memtable_key(); - std::shared_ptr iter( - table_->GetIterator(lkey.user_key())); + std::unique_ptr iter( + table_->GetIterator(lkey.user_key())); iter->Seek(memkey.data()); if (iter->Valid()) { @@ -329,8 +329,8 @@ size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) { // A total ordered iterator is costly for some memtablerep (prefix aware // reps). By passing in the user key, we allow efficient iterator creation. // The iterator only needs to be ordered within the same user key. - std::shared_ptr iter( - table_->GetIterator(key.user_key())); + std::unique_ptr iter( + table_->GetIterator(key.user_key())); iter->Seek(memkey.data()); size_t num_successive_merges = 0; diff --git a/db/memtable.h b/db/memtable.h index 24a2c852b..1b9005800 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -143,7 +143,7 @@ class MemTable { KeyComparator comparator_; int refs_; ArenaImpl arena_impl_; - shared_ptr table_; + unique_ptr table_; // These are used to manage memtable flushes to storage bool flush_in_progress_; // started the flush diff --git a/db/version_set.cc b/db/version_set.cc index 64ebb1427..22135b947 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -698,7 +698,7 @@ bool CompareSeqnoDescending(const Version::Fsize& first, return false; } -} // anonymous namespace +} // anonymous namespace void Version::UpdateFilesBySize() { // No need to sort the highest level because it is never compacted. 
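
UpdateFilesBySize(), visible as context at the end of the version_set.cc hunk above, sorts file indices rather than files, and only as far as the picker needs: std::partial_sort orders just the leading entries. A self-contained toy with invented sizes:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    int main() {
      std::vector<uint64_t> sizes = {10, 500, 30, 200};  // invented file sizes
      std::vector<int> idx = {0, 1, 2, 3};               // files_by_size_ analog
      // Order only the two largest; the tail stays unspecified.
      std::partial_sort(idx.begin(), idx.begin() + 2, idx.end(),
                        [&](int a, int b) { return sizes[a] > sizes[b]; });
      // idx now begins {1, 3}: the 500- and 200-byte files lead.
      return 0;
    }
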
diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index fcb782d41..2fca8d161 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -111,27 +111,23 @@ class MemTableRep { }; // Return an iterator over the keys in this representation. - virtual std::shared_ptr GetIterator() = 0; + virtual Iterator* GetIterator() = 0; // Return an iterator over at least the keys with the specified user key. The // iterator may also allow access to other keys, but doesn't have to. Default: // GetIterator(). - virtual std::shared_ptr GetIterator(const Slice& user_key) { - return GetIterator(); - } + virtual Iterator* GetIterator(const Slice& user_key) { return GetIterator(); } // Return an iterator over at least the keys with the specified prefix. The // iterator may also allow access to other keys, but doesn't have to. Default: // GetIterator(). - virtual std::shared_ptr GetPrefixIterator(const Slice& prefix) { + virtual Iterator* GetPrefixIterator(const Slice& prefix) { return GetIterator(); } // Return an iterator that has a special Seek semantics. The result of // a Seek might only include keys with the same prefix as the target key. - virtual std::shared_ptr GetDynamicPrefixIterator() { - return GetIterator(); - } + virtual Iterator* GetDynamicPrefixIterator() { return GetIterator(); } protected: // When *key is an internal key concatenated with the value, returns the @@ -144,8 +140,8 @@ class MemTableRep { class MemTableRepFactory { public: virtual ~MemTableRepFactory() { }; - virtual std::shared_ptr CreateMemTableRep( - MemTableRep::KeyComparator&, Arena*) = 0; + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, + Arena*) = 0; virtual const char* Name() const = 0; }; @@ -161,8 +157,8 @@ class VectorRepFactory : public MemTableRepFactory { const size_t count_; public: explicit VectorRepFactory(size_t count = 0) : count_(count) { } - virtual std::shared_ptr CreateMemTableRep( - MemTableRep::KeyComparator&, Arena*) override; + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, + Arena*) override; virtual const char* Name() const override { return "VectorRepFactory"; } @@ -171,8 +167,8 @@ public: // This uses a skip list to store keys. It is the default. 
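
Under the revised interface in memtablerep.h, every factory returns an owning raw pointer. A hypothetical third-party factory (not in the patch) that delegates to the stock skip-list implementation shows the shape implementors now write:

    // Hypothetical delegating factory under the new raw-pointer API.
    // The returned rep is owned by the caller, which stores it in a
    // unique_ptr, so no shared_ptr machinery is involved.
    class MySkipListFactory : public MemTableRepFactory {
     public:
      virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator& cmp,
                                             Arena* arena) override {
        return base_.CreateMemTableRep(cmp, arena);
      }
      virtual const char* Name() const override { return "MySkipListFactory"; }

     private:
      SkipListFactory base_;
    };
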
 class SkipListFactory : public MemTableRepFactory {
  public:
-  virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
-      MemTableRep::KeyComparator&, Arena*) override;
+  virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&,
+                                         Arena*) override;
   virtual const char* Name() const override {
     return "SkipListFactory";
   }
diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc
index c669769e0..e9fe1573a 100644
--- a/util/hash_skiplist_rep.cc
+++ b/util/hash_skiplist_rep.cc
@@ -31,17 +31,15 @@ class HashSkipListRep : public MemTableRep {

   virtual ~HashSkipListRep();

-  virtual std::shared_ptr<MemTableRep::Iterator> GetIterator() override;
+  virtual MemTableRep::Iterator* GetIterator() override;

-  virtual std::shared_ptr<MemTableRep::Iterator> GetIterator(
-      const Slice& slice) override;
+  virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override;

-  virtual std::shared_ptr<MemTableRep::Iterator> GetPrefixIterator(
-      const Slice& prefix) override;
-
-  virtual std::shared_ptr<MemTableRep::Iterator> GetDynamicPrefixIterator()
+  virtual MemTableRep::Iterator* GetPrefixIterator(const Slice& prefix)
     override;
+  virtual MemTableRep::Iterator* GetDynamicPrefixIterator() override;
+
 private:
  friend class DynamicIterator;
  typedef SkipList<const char*, MemTableRep::KeyComparator&> Bucket;
@@ -208,18 +206,15 @@ class HashSkipListRep : public MemTableRep {
     virtual void SeekToLast() { }
    private:
  };
-
-  std::shared_ptr<MemTableRep::Iterator> empty_iterator_;
 };

 HashSkipListRep::HashSkipListRep(MemTableRep::KeyComparator& compare,
-    Arena* arena, const SliceTransform* transform, size_t bucket_size)
-  : bucket_size_(bucket_size),
-    transform_(transform),
-    compare_(compare),
-    arena_(arena),
-    empty_iterator_(std::make_shared<EmptyIterator>()) {
-
+                                 Arena* arena, const SliceTransform* transform,
+                                 size_t bucket_size)
+    : bucket_size_(bucket_size),
+      transform_(transform),
+      compare_(compare),
+      arena_(arena) {
   buckets_ = new port::AtomicPointer[bucket_size];

   for (size_t i = 0; i < bucket_size_; ++i) {
@@ -263,7 +258,7 @@ size_t HashSkipListRep::ApproximateMemoryUsage() {
   return sizeof(buckets_);
 }

-std::shared_ptr<MemTableRep::Iterator> HashSkipListRep::GetIterator() {
+MemTableRep::Iterator* HashSkipListRep::GetIterator() {
   auto list = new Bucket(compare_, arena_);
   for (size_t i = 0; i < bucket_size_; ++i) {
     auto bucket = GetBucket(i);
@@ -274,35 +269,30 @@ MemTableRep::Iterator* HashSkipListRep::GetIterator() {
       }
     }
   }
-  return std::make_shared<Iterator>(list);
+  return new Iterator(list);
 }

-std::shared_ptr<MemTableRep::Iterator> HashSkipListRep::GetPrefixIterator(
-    const Slice& prefix) {
+MemTableRep::Iterator* HashSkipListRep::GetPrefixIterator(const Slice& prefix) {
   auto bucket = GetBucket(prefix);
   if (bucket == nullptr) {
-    return empty_iterator_;
+    return new EmptyIterator();
   }
-  return std::make_shared<Iterator>(bucket, false);
+  return new Iterator(bucket, false);
 }

-std::shared_ptr<MemTableRep::Iterator> HashSkipListRep::GetIterator(
-    const Slice& slice) {
+MemTableRep::Iterator* HashSkipListRep::GetIterator(const Slice& slice) {
   return GetPrefixIterator(transform_->Transform(slice));
 }

-std::shared_ptr<MemTableRep::Iterator>
-  HashSkipListRep::GetDynamicPrefixIterator() {
-  return std::make_shared<DynamicIterator>(*this);
+MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator() {
+  return new DynamicIterator(*this);
 }

 } // anon namespace

-std::shared_ptr<MemTableRep>
-HashSkipListRepFactory::CreateMemTableRep(MemTableRep::KeyComparator &compare,
-                                          Arena *arena) {
-  return std::make_shared<HashSkipListRep>(compare, arena, transform_,
-                                           bucket_count_);
+MemTableRep* HashSkipListRepFactory::CreateMemTableRep(
+    MemTableRep::KeyComparator& compare, Arena* arena) {
+  return new HashSkipListRep(compare, arena, transform_, bucket_count_);
 }

 MemTableRepFactory* NewHashSkipListRepFactory(
diff --git a/util/hash_skiplist_rep.h b/util/hash_skiplist_rep.h
index b946cf05e..7b8414c88 100644
--- a/util/hash_skiplist_rep.h
+++ b/util/hash_skiplist_rep.h
@@ -21,8 +21,8 @@ class HashSkipListRepFactory : public MemTableRepFactory {
   virtual ~HashSkipListRepFactory() { delete transform_; }

-  virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
-      MemTableRep::KeyComparator& compare, Arena* arena) override;
+  virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator& compare,
+                                         Arena* arena) override;

   virtual const char* Name() const override {
     return "HashSkipListRepFactory";
diff --git a/util/skiplistrep.cc b/util/skiplistrep.cc
index 955d754b1..a5b072ad1 100644
--- a/util/skiplistrep.cc
+++ b/util/skiplistrep.cc
@@ -90,15 +90,15 @@ public:
   // Unhide default implementations of GetIterator
   using MemTableRep::GetIterator;

-  virtual std::shared_ptr<MemTableRep::Iterator> GetIterator() override {
-    return std::make_shared<SkipListRep::Iterator>(&skip_list_);
+  virtual MemTableRep::Iterator* GetIterator() override {
+    return new SkipListRep::Iterator(&skip_list_);
   }
 };
 }

-std::shared_ptr<MemTableRep> SkipListFactory::CreateMemTableRep (
-    MemTableRep::KeyComparator& compare, Arena* arena) {
-  return std::shared_ptr<MemTableRep>(new SkipListRep(compare, arena));
+MemTableRep* SkipListFactory::CreateMemTableRep(
+    MemTableRep::KeyComparator& compare, Arena* arena) {
+  return new SkipListRep(compare, arena);
 }

 } // namespace rocksdb
diff --git a/util/vectorrep.cc b/util/vectorrep.cc
index 8d3ccc9df..87fae4bc7 100644
--- a/util/vectorrep.cc
+++ b/util/vectorrep.cc
@@ -88,7 +88,7 @@ class VectorRep : public MemTableRep {
   using MemTableRep::GetIterator;

   // Return an iterator over the keys in this representation.
-  virtual std::shared_ptr<MemTableRep::Iterator> GetIterator() override;
+  virtual MemTableRep::Iterator* GetIterator() override;

  private:
   friend class Iterator;
@@ -228,22 +228,22 @@ void VectorRep::Iterator::SeekToLast() {
   }
 }

-std::shared_ptr<MemTableRep::Iterator> VectorRep::GetIterator() {
+MemTableRep::Iterator* VectorRep::GetIterator() {
   ReadLock l(&rwlock_);
   // Do not sort here. The sorting would be done the first time
   // a Seek is performed on the iterator.
   if (immutable_) {
-    return std::make_shared<Iterator>(this, bucket_, compare_);
+    return new Iterator(this, bucket_, compare_);
   } else {
     std::shared_ptr<Bucket> tmp;
     tmp.reset(new Bucket(*bucket_)); // make a copy
-    return std::make_shared<Iterator>(nullptr, tmp, compare_);
+    return new Iterator(nullptr, tmp, compare_);
   }
 }
 } // anon namespace

-std::shared_ptr<MemTableRep> VectorRepFactory::CreateMemTableRep(
-    MemTableRep::KeyComparator& compare, Arena* arena) {
-  return std::make_shared<VectorRep>(compare, arena, count_);
+MemTableRep* VectorRepFactory::CreateMemTableRep(
+    MemTableRep::KeyComparator& compare, Arena* arena) {
+  return new VectorRep(compare, arena, count_);
 }
 } // namespace rocksdb

From c699c84af47d454837b1e6e0aa64c55b69ee8ff4 Mon Sep 17 00:00:00 2001
From: Igor Canadi
Date: Thu, 16 Jan 2014 13:03:52 -0800
Subject: [PATCH 21/27] CompactionPicker

Summary:
This is a big one. This diff moves all the code related to picking
compactions from VersionSet to the new class CompactionPicker. Column
families' compactions will be completely separate processes, so we need
to have multiple CompactionPickers.

To make this easier to review, most of the code change is just copy/paste.
There is also a small change not to use VersionSet::current_, but rather
to take `Version* version` as a parameter. Most of the other code is
exactly the same.

In future diffs, I will also make some improvements to CompactionPickers.
I think the most important part will be encapsulating it better.
Currently Version, VersionSet, Compaction and CompactionPicker are all
friend classes, which makes it harder to change the implementation.

This diff depends on D15171, D15183, D15189 and D15201

Test Plan: `make check`

Reviewers: kailiu, sdong, dhruba, haobo

Reviewed By: kailiu

CC: leveldb

Differential Revision: https://reviews.facebook.net/D15207
---
 db/compaction.h                     |   3 +
 db/compaction_picker.cc             | 854 +++++++++++++++++++++++++++
 db/compaction_picker.h              | 152 +++++
 db/version_set.cc                   | 856 +---------------------------
 db/version_set.h                    |  69 +--
 db/version_set_reduce_num_levels.cc |   4 +-
 6 files changed, 1041 insertions(+), 897 deletions(-)
 create mode 100644 db/compaction_picker.cc
 create mode 100644 db/compaction_picker.h

diff --git a/db/compaction.h b/db/compaction.h
index 4cc0197da..5e696a053 100644
--- a/db/compaction.h
+++ b/db/compaction.h
@@ -76,6 +76,9 @@ class Compaction {
  private:
   friend class Version;
   friend class VersionSet;
+  friend class CompactionPicker;
+  friend class UniversalCompactionPicker;
+  friend class LevelCompactionPicker;

   Compaction(Version* input_version, int level, int out_level,
              uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes,
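Before the new file itself, it may help to see the shape of the seam this diff creates: a VersionSet owns exactly one picker, chosen once by compaction style, and routes all picking decisions through it. A hedged sketch of that wiring (invented names; the real constructor change appears in the version_set.cc hunk near the end of this patch):

    #include <iostream>
    #include <memory>

    struct PickerBase {
      virtual ~PickerBase() {}
      virtual const char* Name() const = 0;
    };
    struct LevelPicker : PickerBase {
      const char* Name() const override { return "level"; }
    };
    struct UniversalPicker : PickerBase {
      const char* Name() const override { return "universal"; }
    };

    // Mirrors the VersionSet constructor: pick one strategy object up front,
    // then route every compaction decision through the shared interface.
    std::unique_ptr<PickerBase> MakePicker(bool universal_style) {
      if (universal_style) {
        return std::unique_ptr<PickerBase>(new UniversalPicker);
      }
      return std::unique_ptr<PickerBase>(new LevelPicker);
    }

    int main() {
      std::cout << MakePicker(true)->Name() << "\n";   // universal
      std::cout << MakePicker(false)->Name() << "\n";  // level
      return 0;
    }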
diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc
new file mode 100644
index 000000000..fa2fbc663
--- /dev/null
+++ b/db/compaction_picker.cc
@@ -0,0 +1,854 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction_picker.h"
+
+namespace rocksdb {
+
+namespace {
+
+uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+  uint64_t sum = 0;
+  for (size_t i = 0; i < files.size() && files[i]; i++) {
+    sum += files[i]->file_size;
+  }
+  return sum;
+}
+
+} // anonymous namespace
+
+CompactionPicker::CompactionPicker(const Options* options,
+                                   const InternalKeyComparator* icmp)
+    : compactions_in_progress_(options->num_levels),
+      options_(options),
+      num_levels_(options->num_levels),
+      icmp_(icmp) {
+  Init();
+}
+
+void CompactionPicker::ReduceNumberOfLevels(int new_levels) {
+  num_levels_ = new_levels;
+  Init();
+}
+
+void CompactionPicker::Init() {
+  max_file_size_.reset(new uint64_t[NumberLevels()]);
+  level_max_bytes_.reset(new uint64_t[NumberLevels()]);
+  int target_file_size_multiplier = options_->target_file_size_multiplier;
+  int max_bytes_multiplier = options_->max_bytes_for_level_multiplier;
+  for (int i = 0; i < NumberLevels(); i++) {
+    if (i == 0 && options_->compaction_style == kCompactionStyleUniversal) {
+      max_file_size_[i] = ULLONG_MAX;
+      level_max_bytes_[i] = options_->max_bytes_for_level_base;
+    } else if (i > 1) {
+      max_file_size_[i] = max_file_size_[i - 1] * target_file_size_multiplier;
+      level_max_bytes_[i] =
+          level_max_bytes_[i - 1] * max_bytes_multiplier *
+          options_->max_bytes_for_level_multiplier_additional[i - 1];
+    } else {
+      max_file_size_[i] = options_->target_file_size_base;
+      level_max_bytes_[i] = options_->max_bytes_for_level_base;
+    }
+  }
+}
+
+CompactionPicker::~CompactionPicker() {}
+
+void CompactionPicker::SizeBeingCompacted(std::vector<uint64_t>& sizes) {
+  for (int level = 0; level < NumberLevels() - 1; level++) {
+    uint64_t total = 0;
+    for (auto c : compactions_in_progress_[level]) {
+      assert(c->level() == level);
+      for (int i = 0; i < c->num_input_files(0); i++) {
+        total += c->input(0,i)->file_size;
+      }
+    }
+    sizes[level] = total;
+  }
+}
+
+// Clear all files to indicate that they are not being compacted
+// Delete this compaction from the list of running compactions.
+void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) {
+  c->MarkFilesBeingCompacted(false);
+  compactions_in_progress_[c->level()].erase(c);
+  if (!status.ok()) {
+    c->ResetNextCompactionIndex();
+  }
+}
+
+uint64_t CompactionPicker::MaxFileSizeForLevel(int level) const {
+  assert(level >= 0);
+  assert(level < NumberLevels());
+  return max_file_size_[level];
+}
+
+uint64_t CompactionPicker::MaxGrandParentOverlapBytes(int level) {
+  uint64_t result = MaxFileSizeForLevel(level);
+  result *= options_->max_grandparent_overlap_factor;
+  return result;
+}
+
+double CompactionPicker::MaxBytesForLevel(int level) {
+  // Note: the result for level zero is not really used since we set
+  // the level-0 compaction threshold based on number of files.
+  assert(level >= 0);
+  assert(level < NumberLevels());
+  return level_max_bytes_[level];
+}
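Init() fixes the per-level targets once, geometrically. Running the same arithmetic with illustrative numbers shows the shape of the result (editor's sketch; the 2 MB / 10 MB bases and the multipliers below are assumptions for the example, not values taken from this patch, and the universal-style L0 special case is omitted):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      const int num_levels = 5;
      const uint64_t target_file_size_base = 2 * 1048576;      // assumed 2 MB
      const uint64_t max_bytes_for_level_base = 10 * 1048576;  // assumed 10 MB
      const int target_file_size_multiplier = 1;               // assumed
      const int max_bytes_multiplier = 10;                     // assumed

      std::vector<uint64_t> max_file_size(num_levels), level_max_bytes(num_levels);
      for (int i = 0; i < num_levels; i++) {
        if (i > 1) {  // levels 2+ scale off the previous level, as in Init()
          max_file_size[i] = max_file_size[i - 1] * target_file_size_multiplier;
          level_max_bytes[i] = level_max_bytes[i - 1] * max_bytes_multiplier;
        } else {      // levels 0 and 1 use the base values
          max_file_size[i] = target_file_size_base;
          level_max_bytes[i] = max_bytes_for_level_base;
        }
        std::printf("L%d: file target %llu, level target %llu\n", i,
                    (unsigned long long)max_file_size[i],
                    (unsigned long long)level_max_bytes[i]);
      }
      return 0;
    }

With these assumed defaults the level targets grow 10 MB, 10 MB, 100 MB, 1 GB, 10 GB, which is the usual leveled-compaction geometry.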
+
+void CompactionPicker::GetRange(const std::vector<FileMetaData*>& inputs,
+                                InternalKey* smallest, InternalKey* largest) {
+  assert(!inputs.empty());
+  smallest->Clear();
+  largest->Clear();
+  for (size_t i = 0; i < inputs.size(); i++) {
+    FileMetaData* f = inputs[i];
+    if (i == 0) {
+      *smallest = f->smallest;
+      *largest = f->largest;
+    } else {
+      if (icmp_->Compare(f->smallest, *smallest) < 0) {
+        *smallest = f->smallest;
+      }
+      if (icmp_->Compare(f->largest, *largest) > 0) {
+        *largest = f->largest;
+      }
+    }
+  }
+}
+
+void CompactionPicker::GetRange(const std::vector<FileMetaData*>& inputs1,
+                                const std::vector<FileMetaData*>& inputs2,
+                                InternalKey* smallest, InternalKey* largest) {
+  std::vector<FileMetaData*> all = inputs1;
+  all.insert(all.end(), inputs2.begin(), inputs2.end());
+  GetRange(all, smallest, largest);
+}
+
+// Add more files to the inputs on "level" to make sure that
+// no newer version of a key is compacted to "level+1" while leaving an older
+// version in a "level". Otherwise, any Get() will search "level" first,
+// and will likely return an old/stale value for the key, since it always
+// searches in increasing order of level to find the value. This could
+// also scramble the order of merge operands. This function should be
+// called any time a new Compaction is created, and its inputs_[0] are
+// populated.
+//
+// Will set c to nullptr if it is impossible to apply this compaction.
+void CompactionPicker::ExpandWhileOverlapping(Compaction* c) {
+  // If inputs are empty then there is nothing to expand.
+  if (!c || c->inputs_[0].empty()) {
+    return;
+  }
+
+  // GetOverlappingInputs will always do the right thing for level-0.
+  // So we don't need to do any expansion if level == 0.
+  if (c->level() == 0) {
+    return;
+  }
+
+  const int level = c->level();
+  InternalKey smallest, largest;
+
+  // Keep expanding c->inputs_[0] until we are sure that there is a
+  // "clean cut" boundary between the files in input and the surrounding files.
+  // This will ensure that no parts of a key are lost during compaction.
+  int hint_index = -1;
+  size_t old_size;
+  do {
+    old_size = c->inputs_[0].size();
+    GetRange(c->inputs_[0], &smallest, &largest);
+    c->inputs_[0].clear();
+    c->input_version_->GetOverlappingInputs(
+        level, &smallest, &largest, &c->inputs_[0], hint_index, &hint_index);
+  } while (c->inputs_[0].size() > old_size);
+
+  // Get the new range
+  GetRange(c->inputs_[0], &smallest, &largest);
+
+  // If, after the expansion, there are files that are already under
+  // compaction, then we must drop/cancel this compaction.
+  int parent_index = -1;
+  if (FilesInCompaction(c->inputs_[0]) ||
+      (c->level() != c->output_level() &&
+       ParentRangeInCompaction(c->input_version_, &smallest, &largest, level,
+                               &parent_index))) {
+    c->inputs_[0].clear();
+    c->inputs_[1].clear();
+    delete c;
+    c = nullptr;
+  }
+}
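ExpandWhileOverlapping is a fixed-point computation: grow the input set, recompute its key range, and repeat until no new file falls inside the range. The same loop on plain integer intervals (editor's analogy with made-up data, not RocksDB code):

    #include <cstdio>
    #include <vector>

    struct Interval { int lo, hi; };

    int main() {
      // Files on one level as [lo, hi] key intervals; files 2 and 3 share
      // the boundary key 25, so a "clean cut" must take both.
      std::vector<Interval> level = {{0, 10}, {12, 20}, {21, 25}, {25, 30}};
      std::vector<int> picked = {2};  // start from file [21, 25]

      size_t old_size;
      do {
        old_size = picked.size();
        int lo = level[picked.front()].lo, hi = level[picked.back()].hi;
        picked.clear();
        for (int i = 0; i < (int)level.size(); i++) {  // re-collect overlaps
          if (level[i].hi >= lo && level[i].lo <= hi) picked.push_back(i);
        }
      } while (picked.size() > old_size);  // stop at the fixed point

      for (int i : picked) std::printf("file %d\n", i);  // prints 2 and 3
      return 0;
    }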
+
+uint64_t CompactionPicker::ExpandedCompactionByteSizeLimit(int level) {
+  uint64_t result = MaxFileSizeForLevel(level);
+  result *= options_->expanded_compaction_factor;
+  return result;
+}
+
+// Returns true if any one of the specified files is being compacted
+bool CompactionPicker::FilesInCompaction(std::vector<FileMetaData*>& files) {
+  for (unsigned int i = 0; i < files.size(); i++) {
+    if (files[i]->being_compacted) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Returns true if any one of the parent files is being compacted
+bool CompactionPicker::ParentRangeInCompaction(Version* version,
+                                               const InternalKey* smallest,
+                                               const InternalKey* largest,
+                                               int level, int* parent_index) {
+  std::vector<FileMetaData*> inputs;
+  assert(level + 1 < NumberLevels());
+
+  version->GetOverlappingInputs(level + 1, smallest, largest, &inputs,
+                                *parent_index, parent_index);
+  return FilesInCompaction(inputs);
+}
+
+// Populates the set of inputs from "level+1" that overlap with "level".
+// Will also attempt to expand "level" if that doesn't expand "level+1"
+// or cause "level" to include a file for compaction that has an overlapping
+// user-key with another file.
+void CompactionPicker::SetupOtherInputs(Compaction* c) {
+  // If inputs are empty, then there is nothing to expand.
+  // If both input and output levels are the same, no need to consider
+  // files at level "level+1"
+  if (c->inputs_[0].empty() || c->level() == c->output_level()) {
+    return;
+  }
+
+  const int level = c->level();
+  InternalKey smallest, largest;
+
+  // Get the range one last time.
+  GetRange(c->inputs_[0], &smallest, &largest);
+
+  // Populate the set of next-level files (inputs_[1]) to include in compaction
+  c->input_version_->GetOverlappingInputs(level + 1, &smallest, &largest,
+                                          &c->inputs_[1], c->parent_index_,
+                                          &c->parent_index_);
+
+  // Get entire range covered by compaction
+  InternalKey all_start, all_limit;
+  GetRange(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+
+  // See if we can further grow the number of inputs in "level" without
+  // changing the number of "level+1" files we pick up. We also choose NOT
+  // to expand if this would cause "level" to include some entries for some
+  // user key, while excluding other entries for the same user key. This
+  // can happen when one user key spans multiple files.
+  if (!c->inputs_[1].empty()) {
+    std::vector<FileMetaData*> expanded0;
+    c->input_version_->GetOverlappingInputs(
+        level, &all_start, &all_limit, &expanded0, c->base_index_, nullptr);
+    const uint64_t inputs0_size = TotalFileSize(c->inputs_[0]);
+    const uint64_t inputs1_size = TotalFileSize(c->inputs_[1]);
+    const uint64_t expanded0_size = TotalFileSize(expanded0);
+    uint64_t limit = ExpandedCompactionByteSizeLimit(level);
+    if (expanded0.size() > c->inputs_[0].size() &&
+        inputs1_size + expanded0_size < limit &&
+        !FilesInCompaction(expanded0) &&
+        !c->input_version_->HasOverlappingUserKey(&expanded0, level)) {
+      InternalKey new_start, new_limit;
+      GetRange(expanded0, &new_start, &new_limit);
+      std::vector<FileMetaData*> expanded1;
+      c->input_version_->GetOverlappingInputs(level + 1, &new_start, &new_limit,
+                                              &expanded1, c->parent_index_,
+                                              &c->parent_index_);
+      if (expanded1.size() == c->inputs_[1].size() &&
+          !FilesInCompaction(expanded1)) {
+        Log(options_->info_log,
+            "Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu bytes)"
+            "\n",
+            (unsigned long)level,
+            (unsigned long)(c->inputs_[0].size()),
+            (unsigned long)(c->inputs_[1].size()),
+            (unsigned long)inputs0_size,
+            (unsigned long)inputs1_size,
+            (unsigned long)(expanded0.size()),
+            (unsigned long)(expanded1.size()),
+            (unsigned long)expanded0_size,
+            (unsigned long)inputs1_size);
+        smallest = new_start;
+        largest = new_limit;
+        c->inputs_[0] = expanded0;
+        c->inputs_[1] = expanded1;
+        GetRange(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+      }
+    }
+  }
+
+  // Compute the set of grandparent files that overlap this compaction
+  // (parent == level+1; grandparent == level+2)
+  if (level + 2 < NumberLevels()) {
+    c->input_version_->GetOverlappingInputs(level + 2, &all_start, &all_limit,
+                                            &c->grandparents_);
+  }
+}
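The expansion guard above is just an arithmetic check; plugging in numbers makes the intent clear (editor's sketch; all sizes invented for illustration):

    #include <cstdint>
    #include <cstdio>

    // Would we take the expanded level-n input set? Mirrors the condition in
    // SetupOtherInputs: strictly more files than before, and the combined
    // bytes stay under the expanded-compaction limit.
    bool TakeExpansion(size_t inputs0, size_t expanded0, uint64_t inputs1_bytes,
                       uint64_t expanded0_bytes, uint64_t limit) {
      return expanded0 > inputs0 && inputs1_bytes + expanded0_bytes < limit;
    }

    int main() {
      // 3 files could grow to 5; 40 MB + 40 MB fits under a 100 MB limit.
      std::printf("%d\n", TakeExpansion(3, 5, 40u << 20, 40u << 20,
                                        100u << 20));  // 1: expand
      // Same sizes, but the expansion adds no files: not worth re-reading.
      std::printf("%d\n", TakeExpansion(5, 5, 40u << 20, 40u << 20,
                                        100u << 20));  // 0: keep original
      return 0;
    }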
+
+Compaction* CompactionPicker::CompactRange(Version* version, int input_level,
+                                           int output_level,
+                                           const InternalKey* begin,
+                                           const InternalKey* end,
+                                           InternalKey** compaction_end) {
+  std::vector<FileMetaData*> inputs;
+  bool covering_the_whole_range = true;
+
+  // All files are 'overlapping' in universal style compaction.
+  // We have to compact the entire range in one shot.
+  if (options_->compaction_style == kCompactionStyleUniversal) {
+    begin = nullptr;
+    end = nullptr;
+  }
+  version->GetOverlappingInputs(input_level, begin, end, &inputs);
+  if (inputs.empty()) {
+    return nullptr;
+  }
+
+  // Avoid compacting too much in one shot in case the range is large.
+  // But we cannot do this for level-0 since level-0 files can overlap
+  // and we must not pick one file and drop another older file if the
+  // two files overlap.
+  if (input_level > 0) {
+    const uint64_t limit =
+        MaxFileSizeForLevel(input_level) * options_->source_compaction_factor;
+    uint64_t total = 0;
+    for (size_t i = 0; i + 1 < inputs.size(); ++i) {
+      uint64_t s = inputs[i]->file_size;
+      total += s;
+      if (total >= limit) {
+        **compaction_end = inputs[i + 1]->smallest;
+        covering_the_whole_range = false;
+        inputs.resize(i + 1);
+        break;
+      }
+    }
+  }
+  Compaction* c = new Compaction(version, input_level, output_level,
+                                 MaxFileSizeForLevel(output_level),
+                                 MaxGrandParentOverlapBytes(input_level));
+
+  c->inputs_[0] = inputs;
+  ExpandWhileOverlapping(c);
+  if (c == nullptr) {
+    Log(options_->info_log, "Could not compact due to expansion failure.\n");
+    return nullptr;
+  }
+
+  SetupOtherInputs(c);
+
+  if (covering_the_whole_range) {
+    *compaction_end = nullptr;
+  }
+
+  // These files that are to be manually compacted do not trample
+  // upon other files because manual compactions are processed when
+  // the system has a max of 1 background compaction thread.
+  c->MarkFilesBeingCompacted(true);
+
+  // Is this compaction creating a file at the bottommost level
+  c->SetupBottomMostLevel(true);
+  return c;
+}
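The input_level > 0 branch caps how much a manual compaction bites off in one shot. A standalone sketch of that accumulation loop (editor's illustration; the sizes and the 70 MB limit are invented):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<uint64_t> file_sizes = {30, 30, 30, 30, 30};  // MB, in key order
      const uint64_t limit = 70;  // stands in for file-size target * factor

      uint64_t total = 0;
      size_t keep = file_sizes.size();
      for (size_t i = 0; i + 1 < file_sizes.size(); ++i) {
        total += file_sizes[i];
        if (total >= limit) {   // stop here; file i+1's smallest key becomes
          keep = i + 1;         // the recorded *compaction_end
          break;
        }
      }
      // Compact the first `keep` files now; the caller re-issues the request
      // starting from the recorded end key to cover the rest of the range.
      std::printf("compact %zu of %zu files this round\n", keep,
                  file_sizes.size());
      return 0;
    }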
+
+Compaction* LevelCompactionPicker::PickCompaction(Version* version) {
+  Compaction* c = nullptr;
+  int level = -1;
+
+  // Compute the compactions needed. It is better to do it here
+  // and also in LogAndApply(), otherwise the values could be stale.
+  std::vector<uint64_t> size_being_compacted(NumberLevels() - 1);
+  SizeBeingCompacted(size_being_compacted);
+  version->Finalize(size_being_compacted);
+
+  // We prefer compactions triggered by too much data in a level over
+  // the compactions triggered by seeks.
+  //
+  // Find the compactions by size on all levels.
+  for (int i = 0; i < NumberLevels() - 1; i++) {
+    assert(i == 0 ||
+           version->compaction_score_[i] <= version->compaction_score_[i - 1]);
+    level = version->compaction_level_[i];
+    if ((version->compaction_score_[i] >= 1)) {
+      c = PickCompactionBySize(version, level, version->compaction_score_[i]);
+      ExpandWhileOverlapping(c);
+      if (c != nullptr) {
+        break;
+      }
+    }
+  }
+
+  // Find compactions needed by seeks
+  FileMetaData* f = version->file_to_compact_;
+  if (c == nullptr && f != nullptr && !f->being_compacted) {
+
+    level = version->file_to_compact_level_;
+    int parent_index = -1;
+
+    // Only allow one level 0 compaction at a time.
+    // Do not pick this file if its parents at level+1 are being compacted.
+    if (level != 0 || compactions_in_progress_[0].empty()) {
+      if (!ParentRangeInCompaction(version, &f->smallest, &f->largest, level,
+                                   &parent_index)) {
+        c = new Compaction(version, level, level + 1,
+                           MaxFileSizeForLevel(level + 1),
+                           MaxGrandParentOverlapBytes(level), true);
+        c->inputs_[0].push_back(f);
+        c->parent_index_ = parent_index;
+        c->input_version_->file_to_compact_ = nullptr;
+        ExpandWhileOverlapping(c);
+      }
+    }
+  }
+
+  if (c == nullptr) {
+    return nullptr;
+  }
+
+  // Two level-0 compactions won't run at the same time, so we don't need to
+  // worry about files on level 0 being compacted.
+  if (level == 0) {
+    assert(compactions_in_progress_[0].empty());
+    InternalKey smallest, largest;
+    GetRange(c->inputs_[0], &smallest, &largest);
+    // Note that the next call will discard the file we placed in
+    // c->inputs_[0] earlier and replace it with an overlapping set
+    // which will include the picked file.
+    c->inputs_[0].clear();
+    c->input_version_->GetOverlappingInputs(0, &smallest, &largest,
+                                            &c->inputs_[0]);
+
+    // If we include more L0 files in the same compaction run it can
+    // cause the 'smallest' and 'largest' key to get extended to a
+    // larger range. So, re-invoke GetRange to get the new key range
+    GetRange(c->inputs_[0], &smallest, &largest);
+    if (ParentRangeInCompaction(c->input_version_, &smallest, &largest, level,
+                                &c->parent_index_)) {
+      delete c;
+      return nullptr;
+    }
+    assert(!c->inputs_[0].empty());
+  }
+
+  // Setup "level+1" files (inputs_[1])
+  SetupOtherInputs(c);
+
+  // mark all the files that are being compacted
+  c->MarkFilesBeingCompacted(true);
+
+  // Is this compaction creating a file at the bottommost level
+  c->SetupBottomMostLevel(false);
+
+  // remember this currently undergoing compaction
+  compactions_in_progress_[level].insert(c);
+
+  return c;
+}
+
+Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version,
+                                                        int level,
+                                                        double score) {
+  Compaction* c = nullptr;
+
+  // level 0 files are overlapping. So we cannot pick more
+  // than one concurrent compactions at this level. This
+  // could be made better by looking at key-ranges that are
+  // being compacted at level 0.
+  if (level == 0 && compactions_in_progress_[level].size() == 1) {
+    return nullptr;
+  }
+
+  assert(level >= 0);
+  assert(level + 1 < NumberLevels());
+  c = new Compaction(version, level, level + 1, MaxFileSizeForLevel(level + 1),
+                     MaxGrandParentOverlapBytes(level));
+  c->score_ = score;
+
+  // Pick the largest file in this level that is not already
+  // being compacted
+  std::vector<int>& file_size = c->input_version_->files_by_size_[level];
+
+  // record the first file that is not yet compacted
+  int nextIndex = -1;
+
+  for (unsigned int i = c->input_version_->next_file_to_compact_by_size_[level];
+       i < file_size.size(); i++) {
+    int index = file_size[i];
+    FileMetaData* f = c->input_version_->files_[level][index];
+
+    // check to verify files are arranged in descending size
+    assert((i == file_size.size() - 1) ||
+           (i >= Version::number_of_files_to_sort_ - 1) ||
+           (f->file_size >=
+            c->input_version_->files_[level][file_size[i + 1]]->file_size));
+
+    // do not pick a file to compact if it is being compacted
+    // from n-1 level.
+    if (f->being_compacted) {
+      continue;
+    }
+
+    // remember the startIndex for the next call to PickCompaction
+    if (nextIndex == -1) {
+      nextIndex = i;
+    }
+
+    //if (i > Version::number_of_files_to_sort_) {
+    //  Log(options_->info_log, "XXX Looking at index %d", i);
+    //}
+
+    // Do not pick this file if its parents at level+1 are being compacted.
+    // Maybe we can avoid redoing this work in SetupOtherInputs
+    int parent_index = -1;
+    if (ParentRangeInCompaction(c->input_version_, &f->smallest, &f->largest,
+                                level, &parent_index)) {
+      continue;
+    }
+    c->inputs_[0].push_back(f);
+    c->base_index_ = index;
+    c->parent_index_ = parent_index;
+    break;
+  }
+
+  if (c->inputs_[0].empty()) {
+    delete c;
+    c = nullptr;
+  }
+
+  // store where to start the iteration in the next call to PickCompaction
+  c->input_version_->next_file_to_compact_by_size_[level] = nextIndex;
+
+  return c;
+}
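PickCompactionBySize walks files_by_size_, an index vector sorted by descending file size, and takes the first file that is neither busy itself nor blocked by a busy parent range. Stripped to its core (editor's sketch with made-up files; the parent-range check is omitted):

    #include <cstdio>
    #include <vector>

    struct File { unsigned long size; bool being_compacted; };

    int main() {
      std::vector<File> files = {{50, false}, {90, true}, {70, false}};
      std::vector<int> by_size = {1, 2, 0};  // indexes, descending by size

      int chosen = -1;
      for (int idx : by_size) {
        if (files[idx].being_compacted) continue;  // skip busy files
        chosen = idx;                              // largest eligible file
        break;
      }
      // The 90-unit file is busy, so the 70-unit file (index 2) wins.
      std::printf("picked file %d (size %lu)\n", chosen, files[chosen].size);
      return 0;
    }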
+
+// Universal style of compaction. Pick files that are contiguous in
+// time-range to compact.
+//
+Compaction* UniversalCompactionPicker::PickCompaction(Version* version) {
+  int level = 0;
+  double score = version->compaction_score_[0];
+
+  if ((version->files_[level].size() <
+       (unsigned int)options_->level0_file_num_compaction_trigger)) {
+    Log(options_->info_log, "Universal: nothing to do\n");
+    return nullptr;
+  }
+  Version::FileSummaryStorage tmp;
+  Log(options_->info_log, "Universal: candidate files(%lu): %s\n",
+      version->files_[level].size(),
+      version->LevelFileSummary(&tmp, 0));
+
+  // Check for size amplification first.
+  Compaction* c = PickCompactionUniversalSizeAmp(version, score);
+  if (c == nullptr) {
+
+    // Size amplification is within limits. Try reducing read
+    // amplification while maintaining file size ratios.
+    unsigned int ratio = options_->compaction_options_universal.size_ratio;
+    c = PickCompactionUniversalReadAmp(version, score, ratio, UINT_MAX);
+
+    // Size amplification and file size ratios are within configured limits.
+    // If max read amplification is exceeding configured limits, then force
+    // compaction without looking at file size ratios and try to reduce
+    // the number of files to fewer than level0_file_num_compaction_trigger.
+    if (c == nullptr) {
+      unsigned int num_files = version->files_[level].size() -
+                               options_->level0_file_num_compaction_trigger;
+      c = PickCompactionUniversalReadAmp(version, score, UINT_MAX, num_files);
+    }
+  }
+  if (c == nullptr) {
+    return nullptr;
+  }
+  assert(c->inputs_[0].size() > 1);
+
+  // validate that all the chosen files are non-overlapping in time
+  FileMetaData* newerfile __attribute__((unused)) = nullptr;
+  for (unsigned int i = 0; i < c->inputs_[0].size(); i++) {
+    FileMetaData* f = c->inputs_[0][i];
+    assert (f->smallest_seqno <= f->largest_seqno);
+    assert(newerfile == nullptr ||
+           newerfile->smallest_seqno > f->largest_seqno);
+    newerfile = f;
+  }
+
+  // The files are sorted from newest first to oldest last.
+  std::vector<int>& file_by_time = c->input_version_->files_by_size_[level];
+
+  // Is the earliest file part of this compaction?
+  int last_index = file_by_time[file_by_time.size()-1];
+  FileMetaData* last_file = c->input_version_->files_[level][last_index];
+  if (c->inputs_[0][c->inputs_[0].size()-1] == last_file) {
+    c->bottommost_level_ = true;
+  }
+
+  // update statistics
+  if (options_->statistics != nullptr) {
+    options_->statistics->measureTime(NUM_FILES_IN_SINGLE_COMPACTION,
+                                      c->inputs_[0].size());
+  }
+
+  // mark all the files that are being compacted
+  c->MarkFilesBeingCompacted(true);
+
+  // remember this currently undergoing compaction
+  compactions_in_progress_[level].insert(c);
+
+  // Record whether this compaction includes all sst files.
+  // For now, it is only relevant in universal compaction mode.
+  c->is_full_compaction_ =
+      (c->inputs_[0].size() == c->input_version_->files_[0].size());
+
+  return c;
+}
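PickCompaction above tries three strategies in a fixed order: size amplification, then read amplification under the configured ratio, then a forced pass. The control flow reduced to a skeleton (editor's sketch; the pass functions are stand-ins, not the real pickers):

    #include <cstdio>

    // Stand-ins for the two picking passes; 0 means "no compaction found".
    int SizeAmpPass() { return 0; }
    int ReadAmpPass(unsigned ratio, unsigned max_files) {
      (void)max_files;
      // Pretend only the forced pass (ratio == max unsigned) finds work.
      return ratio == ~0u ? 7 : 0;
    }

    int main() {
      int c = SizeAmpPass();                // 1) bound space amplification
      if (c == 0) c = ReadAmpPass(1, ~0u);  // 2) honor the size_ratio setting
      if (c == 0) c = ReadAmpPass(~0u, 4);  // 3) forced: ignore ratios, just
                                            //    shrink the L0 file count
      if (c != 0) std::printf("picked compaction %d\n", c);
      else std::printf("nothing to do\n");
      return 0;
    }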
+
+//
+// Consider compaction files based on their size differences with
+// the next file in time order.
+//
+Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp(
+    Version* version, double score, unsigned int ratio,
+    unsigned int max_number_of_files_to_compact) {
+  int level = 0;
+
+  unsigned int min_merge_width =
+      options_->compaction_options_universal.min_merge_width;
+  unsigned int max_merge_width =
+      options_->compaction_options_universal.max_merge_width;
+
+  // The files are sorted from newest first to oldest last.
+  std::vector<int>& file_by_time = version->files_by_size_[level];
+  FileMetaData* f = nullptr;
+  bool done = false;
+  int start_index = 0;
+  unsigned int candidate_count;
+  assert(file_by_time.size() == version->files_[level].size());
+
+  unsigned int max_files_to_compact = std::min(max_merge_width,
+                                               max_number_of_files_to_compact);
+  min_merge_width = std::max(min_merge_width, 2U);
+
+  // Considers a candidate file only if it is smaller than the
+  // total size accumulated so far.
+  for (unsigned int loop = 0; loop < file_by_time.size(); loop++) {
+
+    candidate_count = 0;
+
+    // Skip files that are already being compacted
+    for (f = nullptr; loop < file_by_time.size(); loop++) {
+      int index = file_by_time[loop];
+      f = version->files_[level][index];
+
+      if (!f->being_compacted) {
+        candidate_count = 1;
+        break;
+      }
+      Log(options_->info_log,
+          "Universal: file %lu[%d] being compacted, skipping",
+          (unsigned long)f->number, loop);
+      f = nullptr;
+    }
+
+    // This file is not being compacted. Consider it as the
+    // first candidate to be compacted.
+    uint64_t candidate_size = f != nullptr ? f->file_size : 0;
+    if (f != nullptr) {
+      Log(options_->info_log, "Universal: Possible candidate file %lu[%d].",
+          (unsigned long)f->number, loop);
+    }
+
+    // Check if the succeeding files need compaction.
+    for (unsigned int i = loop+1;
+         candidate_count < max_files_to_compact && i < file_by_time.size();
+         i++) {
+      int index = file_by_time[i];
+      FileMetaData* f = version->files_[level][index];
+      if (f->being_compacted) {
+        break;
+      }
+      // pick files if the total candidate file size (increased by the
+      // specified ratio) is still larger than the next candidate file.
+      uint64_t sz = (candidate_size * (100L + ratio)) / 100;
+      if (sz < f->file_size) {
+        break;
+      }
+      candidate_count++;
+      candidate_size += f->file_size;
+    }
+
+    // Found a series of consecutive files that need compaction.
+    if (candidate_count >= (unsigned int)min_merge_width) {
+      start_index = loop;
+      done = true;
+      break;
+    } else {
+      for (unsigned int i = loop;
+           i < loop + candidate_count && i < file_by_time.size(); i++) {
+        int index = file_by_time[i];
+        FileMetaData* f = version->files_[level][index];
+        Log(options_->info_log,
+            "Universal: Skipping file %lu[%d] with size %lu %d\n",
+            (unsigned long)f->number,
+            i,
+            (unsigned long)f->file_size,
+            f->being_compacted);
+      }
+    }
+  }
+  if (!done || candidate_count <= 1) {
+    return nullptr;
+  }
+  unsigned int first_index_after = start_index + candidate_count;
+  // Compression is enabled if files compacted earlier already reached
+  // size ratio of compression.
+  bool enable_compression = true;
+  int ratio_to_compress =
+      options_->compaction_options_universal.compression_size_percent;
+  if (ratio_to_compress >= 0) {
+    uint64_t total_size = version->NumLevelBytes(level);
+    uint64_t older_file_size = 0;
+    for (unsigned int i = file_by_time.size() - 1; i >= first_index_after;
+         i--) {
+      older_file_size += version->files_[level][file_by_time[i]]->file_size;
+      if (older_file_size * 100L >= total_size * (long) ratio_to_compress) {
+        enable_compression = false;
+        break;
+      }
+    }
+  }
+  Compaction* c =
+      new Compaction(version, level, level, MaxFileSizeForLevel(level),
+                     LLONG_MAX, false, enable_compression);
+  c->score_ = score;
+
+  for (unsigned int i = start_index; i < first_index_after; i++) {
+    int index = file_by_time[i];
+    FileMetaData* f = c->input_version_->files_[level][index];
+    c->inputs_[0].push_back(f);
+    Log(options_->info_log, "Universal: Picking file %lu[%d] with size %lu\n",
+        (unsigned long)f->number,
+        i,
+        (unsigned long)f->file_size);
+  }
+  return c;
+}
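The ratio test above decides whether the run of candidates keeps growing: the next file joins only if it is no bigger than the accumulated size inflated by size_ratio percent. Worked numerically (editor's sketch; sizes invented):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      const uint64_t ratio = 1;  // size_ratio, in percent
      std::vector<uint64_t> sizes = {100, 100, 100, 500};  // newest -> oldest

      uint64_t candidate_size = sizes[0];
      size_t count = 1;
      for (size_t i = 1; i < sizes.size(); i++) {
        uint64_t sz = (candidate_size * (100 + ratio)) / 100;  // inflated total
        if (sz < sizes[i]) break;  // next file too big: end of the run
        candidate_size += sizes[i];
        count++;
      }
      // The three 100-unit files merge; the 500-unit file ends the run.
      std::printf("merge %zu files totalling %llu\n", count,
                  (unsigned long long)candidate_size);
      return 0;
    }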
+
+// Look at overall size amplification. If size amplification
+// exceeds the configured value, then do a compaction
+// of the candidate files all the way up to the earliest
+// base file (overrides configured values of file-size ratios,
+// min_merge_width and max_merge_width).
+//
+Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp(
+    Version* version, double score) {
+  int level = 0;
+
+  // percentage flexibility while reducing size amplification
+  uint64_t ratio = options_->compaction_options_universal.
+      max_size_amplification_percent;
+
+  // The files are sorted from newest first to oldest last.
+  std::vector<int>& file_by_time = version->files_by_size_[level];
+  assert(file_by_time.size() == version->files_[level].size());
+
+  unsigned int candidate_count = 0;
+  uint64_t candidate_size = 0;
+  unsigned int start_index = 0;
+  FileMetaData* f = nullptr;
+
+  // Skip files that are already being compacted
+  for (unsigned int loop = 0; loop < file_by_time.size() - 1; loop++) {
+    int index = file_by_time[loop];
+    f = version->files_[level][index];
+    if (!f->being_compacted) {
+      start_index = loop;  // Consider this as the first candidate.
+      break;
+    }
+    Log(options_->info_log, "Universal: skipping file %lu[%d] compacted %s",
+        (unsigned long)f->number,
+        loop,
+        " cannot be a candidate to reduce size amp.\n");
+    f = nullptr;
+  }
+  if (f == nullptr) {
+    return nullptr;  // no candidate files
+  }
+
+  Log(options_->info_log, "Universal: First candidate file %lu[%d] %s",
+      (unsigned long)f->number,
+      start_index,
+      " to reduce size amp.\n");
+
+  // keep adding up all the remaining files
+  for (unsigned int loop = start_index; loop < file_by_time.size() - 1;
+       loop++) {
+    int index = file_by_time[loop];
+    f = version->files_[level][index];
+    if (f->being_compacted) {
+      Log(options_->info_log,
+          "Universal: Possible candidate file %lu[%d] %s.",
+          (unsigned long)f->number,
+          loop,
+          " is already being compacted. No size amp reduction possible.\n");
+      return nullptr;
+    }
+    candidate_size += f->file_size;
+    candidate_count++;
+  }
+  if (candidate_count == 0) {
+    return nullptr;
+  }
+
+  // size of earliest file
+  int index = file_by_time[file_by_time.size() - 1];
+  uint64_t earliest_file_size = version->files_[level][index]->file_size;
+
+  // size amplification = percentage of additional size
+  if (candidate_size * 100 < ratio * earliest_file_size) {
+    Log(options_->info_log,
+        "Universal: size amp not needed. newer-files-total-size %lu "
+        "earliest-file-size %lu",
+        (unsigned long)candidate_size,
+        (unsigned long)earliest_file_size);
+    return nullptr;
+  } else {
+    Log(options_->info_log,
+        "Universal: size amp needed. newer-files-total-size %lu "
+        "earliest-file-size %lu",
+        (unsigned long)candidate_size,
+        (unsigned long)earliest_file_size);
+  }
+  assert(start_index >= 0 && start_index < file_by_time.size() - 1);
+
+  // create a compaction request
+  // We always compact all the files, so always compress.
+  Compaction* c =
+      new Compaction(version, level, level, MaxFileSizeForLevel(level),
+                     LLONG_MAX, false, true);
+  c->score_ = score;
+  for (unsigned int loop = start_index; loop < file_by_time.size(); loop++) {
+    int index = file_by_time[loop];
+    f = c->input_version_->files_[level][index];
+    c->inputs_[0].push_back(f);
+    Log(options_->info_log,
+        "Universal: size amp picking file %lu[%d] with size %lu",
+        (unsigned long)f->number,
+        index,
+        (unsigned long)f->file_size);
+  }
+  return c;
+}
+
+}  // namespace rocksdb
diff --git a/db/compaction_picker.h b/db/compaction_picker.h
new file mode 100644
index 000000000..980c60013
--- /dev/null
+++ b/db/compaction_picker.h
@@ -0,0 +1,152 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "db/version_set.h"
+#include "db/compaction.h"
+#include "rocksdb/status.h"
+#include "rocksdb/options.h"
+
+#include <vector>
+#include <memory>
+#include <set>
+
+namespace rocksdb {
+
+class Compaction;
+class Version;
+
+class CompactionPicker {
+ public:
+  CompactionPicker(const Options* options, const InternalKeyComparator* icmp);
+  virtual ~CompactionPicker();
+
+  // See VersionSet::ReduceNumberOfLevels()
+  void ReduceNumberOfLevels(int new_levels);
+
+  // Pick level and inputs for a new compaction.
+  // Returns nullptr if there is no compaction to be done.
+  // Otherwise returns a pointer to a heap-allocated object that
+  // describes the compaction. Caller should delete the result.
+  virtual Compaction* PickCompaction(Version* version) = 0;
+
+  // Return a compaction object for compacting the range [begin,end] in
+  // the specified level. Returns nullptr if there is nothing in that
+  // level that overlaps the specified range. Caller should delete
+  // the result.
+  //
+  // The returned Compaction might not include the whole requested range.
+  // In that case, compaction_end will be set to the next key that needs
+  // compacting. In case the compaction will compact the whole range,
+  // compaction_end will be set to nullptr.
+  // Client is responsible for compaction_end storage -- when called,
+  // *compaction_end should point to valid InternalKey!
+  Compaction* CompactRange(Version* version, int input_level, int output_level,
+                           const InternalKey* begin, const InternalKey* end,
+                           InternalKey** compaction_end);
+
+  // Free up the files that participated in a compaction
+  void ReleaseCompactionFiles(Compaction* c, Status status);
+
+  // Return the total amount of data that is undergoing
+  // compactions per level
+  void SizeBeingCompacted(std::vector<uint64_t>& sizes);
+
+  // Returns maximum total overlap bytes with grandparent
+  // level (i.e., level+2) before we stop building a single
+  // file in level->level+1 compaction.
+  uint64_t MaxGrandParentOverlapBytes(int level);
+
+  // Returns maximum total bytes of data on a given level.
+  double MaxBytesForLevel(int level);
+
+  // Get the max file size in a given level.
+  uint64_t MaxFileSizeForLevel(int level) const;
+
+ protected:
+  int NumberLevels() const { return num_levels_; }
+
+  // Stores the minimal range that covers all entries in inputs in
+  // *smallest, *largest.
+  // REQUIRES: inputs is not empty
+  void GetRange(const std::vector<FileMetaData*>& inputs, InternalKey* smallest,
+                InternalKey* largest);
+
+  // Stores the minimal range that covers all entries in inputs1 and inputs2
+  // in *smallest, *largest.
+  // REQUIRES: inputs is not empty
+  void GetRange(const std::vector<FileMetaData*>& inputs1,
+                const std::vector<FileMetaData*>& inputs2,
+                InternalKey* smallest, InternalKey* largest);
+
+  void ExpandWhileOverlapping(Compaction* c);
+
+  uint64_t ExpandedCompactionByteSizeLimit(int level);
+
+  // Returns true if any one of the specified files is being compacted
+  bool FilesInCompaction(std::vector<FileMetaData*>& files);
+
+  // Returns true if any one of the parent files is being compacted
+  bool ParentRangeInCompaction(Version* version, const InternalKey* smallest,
+                               const InternalKey* largest, int level,
+                               int* index);
+
+  void SetupOtherInputs(Compaction* c);
+
+  // record all the ongoing compactions for all levels
+  std::vector<std::set<Compaction*>> compactions_in_progress_;
+
+  // Per-level target file size.
+  std::unique_ptr<uint64_t[]> max_file_size_;
+
+  // Per-level max bytes
+  std::unique_ptr<uint64_t[]> level_max_bytes_;
+
+  const Options* const options_;
+
+ private:
+  void Init();
+
+  int num_levels_;
+
+  const InternalKeyComparator* const icmp_;
+};
+
+class UniversalCompactionPicker : public CompactionPicker {
+ public:
+  UniversalCompactionPicker(const Options* options,
+                            const InternalKeyComparator* icmp)
+      : CompactionPicker(options, icmp) {}
+  virtual Compaction* PickCompaction(Version* version) override;
+
+ private:
+  // Pick Universal compaction to limit read amplification
+  Compaction* PickCompactionUniversalReadAmp(Version* version, double score,
+                                             unsigned int ratio,
+                                             unsigned int num_files);
+
+  // Pick Universal compaction to limit space amplification.
+  Compaction* PickCompactionUniversalSizeAmp(Version* version, double score);
+};
+
+class LevelCompactionPicker : public CompactionPicker {
+ public:
+  LevelCompactionPicker(const Options* options,
+                        const InternalKeyComparator* icmp)
+      : CompactionPicker(options, icmp) {}
+  virtual Compaction* PickCompaction(Version* version) override;
+
+ private:
+  // For the specified level, pick a compaction.
+  // Returns nullptr if there is no compaction to be done.
+  // If level is 0 and there is already a compaction on that level, this
+  // function will return nullptr.
+  Compaction* PickCompactionBySize(Version* version, int level, double score);
+};
+
+}  // namespace rocksdb
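The compaction_end contract in the header above implies a driving loop on the caller's side: compact a chunk, read back the resume key, repeat until the whole range is covered. A hedged sketch of that loop shape (editor's illustration; all names are invented and this is not the RocksDB caller, which lives outside this patch):

    #include <cstdio>

    // Toy stand-in for CompactRange: handles at most three keys per call and
    // reports where the next call must resume; returns false once the whole
    // range [begin, end) has been covered. Illustrates only the
    // *compaction_end protocol, nothing else.
    bool CompactChunk(int begin, int end, int* resume) {
      int stop = (begin + 3 < end) ? begin + 3 : end;
      std::printf("compacting [%d, %d)\n", begin, stop);
      if (stop < end) {
        *resume = stop;  // analogous to setting *compaction_end
        return true;
      }
      return false;      // analogous to *compaction_end == nullptr
    }

    int main() {
      int begin = 0, end = 10, resume = 0;
      // The caller owns the resume-key storage and feeds it back in, just as
      // the header requires *compaction_end to point at a valid InternalKey.
      while (CompactChunk(begin, end, &resume)) {
        begin = resume;
      }
      return 0;
    }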
diff --git a/db/version_set.cc b/db/version_set.cc
index 22135b947..af0883d70 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -784,7 +784,7 @@ int Version::PickLevelForMemTableOutput(
     }
     GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
     const uint64_t sum = TotalFileSize(overlaps);
-    if (sum > vset_->MaxGrandParentOverlapBytes(level)) {
+    if (sum > vset_->compaction_picker_->MaxGrandParentOverlapBytes(level)) {
       break;
     }
     level++;
@@ -1361,13 +1361,16 @@ VersionSet::VersionSet(const std::string& dbname, const Options* options,
       dummy_versions_(this),
       current_(nullptr),
       need_slowdown_for_num_level0_files_(false),
-      compactions_in_progress_(options_->num_levels),
       current_version_number_(0),
       manifest_file_size_(0),
       storage_options_(storage_options),
       storage_options_compactions_(storage_options_) {
   compact_pointer_ = new std::string[options_->num_levels];
-  Init(options_->num_levels);
+  if (options_->compaction_style == kCompactionStyleUniversal) {
+    compaction_picker_.reset(new UniversalCompactionPicker(options_, &icmp_));
+  } else {
+    compaction_picker_.reset(new LevelCompactionPicker(options_, &icmp_));
+  }
   AppendVersion(new Version(this, current_version_number_++));
 }

@@ -1379,28 +1382,6 @@ VersionSet::~VersionSet() {
   }
   obsolete_files_.clear();
   delete[] compact_pointer_;
-  delete[] max_file_size_;
-  delete[] level_max_bytes_;
-}
-
-void VersionSet::Init(int num_levels) {
-  max_file_size_ = new uint64_t[num_levels];
-  level_max_bytes_ = new uint64_t[num_levels];
-  int target_file_size_multiplier = options_->target_file_size_multiplier;
-  int max_bytes_multiplier = options_->max_bytes_for_level_multiplier;
-  for (int i = 0; i < num_levels; i++) {
-    if (i == 0 && options_->compaction_style == kCompactionStyleUniversal) {
-      max_file_size_[i] = ULLONG_MAX;
-      level_max_bytes_[i] = options_->max_bytes_for_level_base;
-    } else if (i > 1) {
-      max_file_size_[i] = max_file_size_[i-1] * target_file_size_multiplier;
-      level_max_bytes_[i] = level_max_bytes_[i-1] * max_bytes_multiplier *
-          options_->max_bytes_for_level_multiplier_additional[i-1];
-    } else {
-      max_file_size_[i] = options_->target_file_size_base;
-      level_max_bytes_[i] = options_->max_bytes_for_level_base;
-    }
-  }
 }

 void VersionSet::AppendVersion(Version* v) {
@@ -1479,7 +1460,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu,
   {  // calculate the amount of data being compacted at every level
     std::vector<uint64_t> size_being_compacted(v->NumberLevels() - 1);
-    SizeBeingCompacted(size_being_compacted);
+    compaction_picker_->SizeBeingCompacted(size_being_compacted);

     mu->Unlock();
@@ -1732,7 +1713,7 @@ Status VersionSet::Recover() {

   // Install recovered version
   std::vector<uint64_t> size_being_compacted(v->NumberLevels() - 1);
-  SizeBeingCompacted(size_being_compacted);
+  compaction_picker_->SizeBeingCompacted(size_being_compacted);
   v->Finalize(size_being_compacted);

   manifest_file_size_ = manifest_file_size;
@@ -1864,7 +1845,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,

   // Install recovered version
   std::vector<uint64_t> size_being_compacted(v->NumberLevels() - 1);
-  SizeBeingCompacted(size_being_compacted);
+  compaction_picker_->SizeBeingCompacted(size_being_compacted);
   v->Finalize(size_being_compacted);

   AppendVersion(v);
@@ -2011,41 +1992,16 @@ void VersionSet::AddLiveFiles(std::vector<uint64_t>* live_list) {
   }
 }

-// Stores the minimal range that covers all entries in inputs in
-// *smallest, *largest.
-// REQUIRES: inputs is not empty
-void VersionSet::GetRange(const std::vector<FileMetaData*>& inputs,
-                          InternalKey* smallest,
-                          InternalKey* largest) {
-  assert(!inputs.empty());
-  smallest->Clear();
-  largest->Clear();
-  for (size_t i = 0; i < inputs.size(); i++) {
-    FileMetaData* f = inputs[i];
-    if (i == 0) {
-      *smallest = f->smallest;
-      *largest = f->largest;
-    } else {
-      if (icmp_.Compare(f->smallest, *smallest) < 0) {
-        *smallest = f->smallest;
-      }
-      if (icmp_.Compare(f->largest, *largest) > 0) {
-        *largest = f->largest;
-      }
-    }
-  }
+Compaction* VersionSet::PickCompaction() {
+  return compaction_picker_->PickCompaction(current_);
 }

-// Stores the minimal range that covers all entries in inputs1 and inputs2
-// in *smallest, *largest.
-// REQUIRES: inputs is not empty
-void VersionSet::GetRange2(const std::vector<FileMetaData*>& inputs1,
-                           const std::vector<FileMetaData*>& inputs2,
-                           InternalKey* smallest,
-                           InternalKey* largest) {
-  std::vector<FileMetaData*> all = inputs1;
-  all.insert(all.end(), inputs2.begin(), inputs2.end());
-  GetRange(all, smallest, largest);
+Compaction* VersionSet::CompactRange(int input_level, int output_level,
+                                     const InternalKey* begin,
+                                     const InternalKey* end,
+                                     InternalKey** compaction_end) {
+  return compaction_picker_->CompactRange(current_, input_level, output_level,
+                                          begin, end, compaction_end);
 }

 Iterator* VersionSet::MakeInputIterator(Compaction* c) {
@@ -2085,29 +2041,11 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
 }

 double VersionSet::MaxBytesForLevel(int level) {
-  // Note: the result for level zero is not really used since we set
-  // the level-0 compaction threshold based on number of files.
-  assert(level >= 0);
-  assert(level < NumberLevels());
-  return level_max_bytes_[level];
+  return compaction_picker_->MaxBytesForLevel(level);
 }

 uint64_t VersionSet::MaxFileSizeForLevel(int level) {
-  assert(level >= 0);
-  assert(level < NumberLevels());
-  return max_file_size_[level];
-}
-
-uint64_t VersionSet::ExpandedCompactionByteSizeLimit(int level) {
-  uint64_t result = MaxFileSizeForLevel(level);
-  result *= options_->expanded_compaction_factor;
-  return result;
-}
-
-uint64_t VersionSet::MaxGrandParentOverlapBytes(int level) {
-  uint64_t result = MaxFileSizeForLevel(level);
-  result *= options_->max_grandparent_overlap_factor;
-  return result;
+  return compaction_picker_->MaxFileSizeForLevel(level);
 }

 // verify that the files listed in this compaction are present
@@ -2158,697 +2096,8 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) {
   return true;  // everything good
 }

-// Clear all files to indicate that they are not being compacted
-// Delete this compaction from the list of running compactions.
 void VersionSet::ReleaseCompactionFiles(Compaction* c, Status status) {
-  c->MarkFilesBeingCompacted(false);
-  compactions_in_progress_[c->level()].erase(c);
-  if (!status.ok()) {
-    c->ResetNextCompactionIndex();
-  }
-}
-
-// The total size of files that are currently being compacted
-// at at every level upto the penultimate level.
-void VersionSet::SizeBeingCompacted(std::vector<uint64_t>& sizes) {
-  for (int level = 0; level < NumberLevels() - 1; level++) {
-    uint64_t total = 0;
-    for (std::set<Compaction*>::iterator it =
-             compactions_in_progress_[level].begin();
-         it != compactions_in_progress_[level].end();
-         ++it) {
-      Compaction* c = (*it);
-      assert(c->level() == level);
-      for (int i = 0; i < c->num_input_files(0); i++) {
-        total += c->input(0,i)->file_size;
-      }
-    }
-    sizes[level] = total;
-  }
-}
-
-//
-// Look at overall size amplification. If size amplification
-// exceeeds the configured value, then do a compaction
-// of the candidate files all the way upto the earliest
-// base file (overrides configured values of file-size ratios,
-// min_merge_width and max_merge_width).
-//
-Compaction* VersionSet::PickCompactionUniversalSizeAmp(int level,
-                                                       double score) {
-  assert (level == 0);
-
-  // percentage flexibilty while reducing size amplification
-  uint64_t ratio = options_->compaction_options_universal.
-      max_size_amplification_percent;
-
-  // The files are sorted from newest first to oldest last.
-  std::vector<int>& file_by_time = current_->files_by_size_[level];
-  assert(file_by_time.size() == current_->files_[level].size());
-
-  unsigned int candidate_count = 0;
-  uint64_t candidate_size = 0;
-  unsigned int start_index = 0;
-  FileMetaData* f = nullptr;
-
-  // Skip files that are already being compacted
-  for (unsigned int loop = 0; loop < file_by_time.size() - 1; loop++) {
-    int index = file_by_time[loop];
-    f = current_->files_[level][index];
-    if (!f->being_compacted) {
-      start_index = loop;  // Consider this as the first candidate.
-      break;
-    }
-    Log(options_->info_log, "Universal: skipping file %lu[%d] compacted %s",
-        (unsigned long)f->number,
-        loop,
-        " cannot be a candidate to reduce size amp.\n");
-    f = nullptr;
-  }
-  if (f == nullptr) {
-    return nullptr;  // no candidate files
-  }
-
-  Log(options_->info_log, "Universal: First candidate file %lu[%d] %s",
-      (unsigned long)f->number,
-      start_index,
-      " to reduce size amp.\n");
-
-  // keep adding up all the remaining files
-  for (unsigned int loop = start_index; loop < file_by_time.size() - 1;
-       loop++) {
-    int index = file_by_time[loop];
-    f = current_->files_[level][index];
-    if (f->being_compacted) {
-      Log(options_->info_log,
-          "Universal: Possible candidate file %lu[%d] %s.",
-          (unsigned long)f->number,
-          loop,
-          " is already being compacted. No size amp reduction possible.\n");
-      return nullptr;
-    }
-    candidate_size += f->file_size;
-    candidate_count++;
-  }
-  if (candidate_count == 0) {
-    return nullptr;
-  }
-
-  // size of earliest file
-  int index = file_by_time[file_by_time.size() - 1];
-  uint64_t earliest_file_size = current_->files_[level][index]->file_size;
-
-  // size amplification = percentage of additional size
-  if (candidate_size * 100 < ratio * earliest_file_size) {
-    Log(options_->info_log,
-        "Universal: size amp not needed. newer-files-total-size %lu "
-        "earliest-file-size %lu",
-        (unsigned long)candidate_size,
-        (unsigned long)earliest_file_size);
-    return nullptr;
-  } else {
-    Log(options_->info_log,
-        "Universal: size amp needed. newer-files-total-size %lu "
-        "earliest-file-size %lu",
-        (unsigned long)candidate_size,
-        (unsigned long)earliest_file_size);
-  }
-  assert(start_index >= 0 && start_index < file_by_time.size() - 1);
-
-  // create a compaction request
-  // We always compact all the files, so always compress.
-  Compaction* c =
-      new Compaction(current_, level, level, MaxFileSizeForLevel(level),
-                     LLONG_MAX, false, true);
-  c->score_ = score;
-  for (unsigned int loop = start_index; loop < file_by_time.size(); loop++) {
-    int index = file_by_time[loop];
-    f = c->input_version_->files_[level][index];
-    c->inputs_[0].push_back(f);
-    Log(options_->info_log,
-        "Universal: size amp picking file %lu[%d] with size %lu",
-        (unsigned long)f->number,
-        index,
-        (unsigned long)f->file_size);
-  }
-  return c;
-}
-
-//
-// Consider compaction files based on their size differences with
-// the next file in time order.
-//
-Compaction* VersionSet::PickCompactionUniversalReadAmp(
-    int level, double score, unsigned int ratio,
-    unsigned int max_number_of_files_to_compact) {
-
-  unsigned int min_merge_width =
-      options_->compaction_options_universal.min_merge_width;
-  unsigned int max_merge_width =
-      options_->compaction_options_universal.max_merge_width;
-
-  // The files are sorted from newest first to oldest last.
-  std::vector<int>& file_by_time = current_->files_by_size_[level];
-  FileMetaData* f = nullptr;
-  bool done = false;
-  int start_index = 0;
-  unsigned int candidate_count;
-  assert(file_by_time.size() == current_->files_[level].size());
-
-  unsigned int max_files_to_compact = std::min(max_merge_width,
-                                               max_number_of_files_to_compact);
-  min_merge_width = std::max(min_merge_width, 2U);
-
-  // Considers a candidate file only if it is smaller than the
-  // total size accumulated so far.
-  for (unsigned int loop = 0; loop < file_by_time.size(); loop++) {
-
-    candidate_count = 0;
-
-    // Skip files that are already being compacted
-    for (f = nullptr; loop < file_by_time.size(); loop++) {
-      int index = file_by_time[loop];
-      f = current_->files_[level][index];
-
-      if (!f->being_compacted) {
-        candidate_count = 1;
-        break;
-      }
-      Log(options_->info_log,
-          "Universal: file %lu[%d] being compacted, skipping",
-          (unsigned long)f->number, loop);
-      f = nullptr;
-    }
-
-    // This file is not being compacted. Consider it as the
-    // first candidate to be compacted.
-    uint64_t candidate_size = f != nullptr? f->file_size : 0;
-    if (f != nullptr) {
-      Log(options_->info_log, "Universal: Possible candidate file %lu[%d].",
-          (unsigned long)f->number, loop);
-    }
-
-    // Check if the suceeding files need compaction.
-    for (unsigned int i = loop+1;
-         candidate_count < max_files_to_compact && i < file_by_time.size();
-         i++) {
-      int index = file_by_time[i];
-      FileMetaData* f = current_->files_[level][index];
-      if (f->being_compacted) {
-        break;
-      }
-      // pick files if the total candidate file size (increased by the
-      // specified ratio) is still larger than the next candidate file.
-      uint64_t sz = (candidate_size * (100L + ratio)) /100;
-      if (sz < f->file_size) {
-        break;
-      }
-      candidate_count++;
-      candidate_size += f->file_size;
-    }
-
-    // Found a series of consecutive files that need compaction.
-    if (candidate_count >= (unsigned int)min_merge_width) {
-      start_index = loop;
-      done = true;
-      break;
-    } else {
-      for (unsigned int i = loop;
-           i < loop + candidate_count && i < file_by_time.size(); i++) {
-        int index = file_by_time[i];
-        FileMetaData* f = current_->files_[level][index];
-        Log(options_->info_log,
-            "Universal: Skipping file %lu[%d] with size %lu %d\n",
-            (unsigned long)f->number,
-            i,
-            (unsigned long)f->file_size,
-            f->being_compacted);
-      }
-    }
-  }
-  if (!done || candidate_count <= 1) {
-    return nullptr;
-  }
-  unsigned int first_index_after = start_index + candidate_count;
-  // Compression is enabled if files compacted earlier already reached
-  // size ratio of compression.
-  bool enable_compression = true;
-  int ratio_to_compress =
-      options_->compaction_options_universal.compression_size_percent;
-  if (ratio_to_compress >= 0) {
-    uint64_t total_size = TotalFileSize(current_->files_[level]);
-    uint64_t older_file_size = 0;
-    for (unsigned int i = file_by_time.size() - 1; i >= first_index_after;
-         i--) {
-      older_file_size += current_->files_[level][file_by_time[i]]->file_size;
-      if (older_file_size * 100L >= total_size * (long) ratio_to_compress) {
-        enable_compression = false;
-        break;
-      }
-    }
-  }
-  Compaction* c =
-      new Compaction(current_, level, level, MaxFileSizeForLevel(level),
-                     LLONG_MAX, false, enable_compression);
-  c->score_ = score;
-
-  for (unsigned int i = start_index; i < first_index_after; i++) {
-    int index = file_by_time[i];
-    FileMetaData* f = c->input_version_->files_[level][index];
-    c->inputs_[0].push_back(f);
-    Log(options_->info_log, "Universal: Picking file %lu[%d] with size %lu\n",
-        (unsigned long)f->number,
-        i,
-        (unsigned long)f->file_size);
-  }
-  return c;
-}
-
-//
-// Universal style of compaction. Pick files that are contiguous in
-// time-range to compact.
-//
-Compaction* VersionSet::PickCompactionUniversal(int level, double score) {
-  assert (level == 0);
-
-  if ((current_->files_[level].size() <
-       (unsigned int)options_->level0_file_num_compaction_trigger)) {
-    Log(options_->info_log, "Universal: nothing to do\n");
-    return nullptr;
-  }
-  Version::FileSummaryStorage tmp;
-  Log(options_->info_log, "Universal: candidate files(%lu): %s\n",
-      current_->files_[level].size(),
-      current_->LevelFileSummary(&tmp, 0));
-
-  // Check for size amplification first.
-  Compaction* c = PickCompactionUniversalSizeAmp(level, score);
-  if (c == nullptr) {
-
-    // Size amplification is within limits. Try reducing read
-    // amplification while maintaining file size ratios.
-    unsigned int ratio = options_->compaction_options_universal.size_ratio;
-    c = PickCompactionUniversalReadAmp(level, score, ratio, UINT_MAX);
-
-    // Size amplification and file size ratios are within configured limits.
-    // If max read amplification is exceeding configured limits, then force
-    // compaction without looking at filesize ratios and try to reduce
-    // the number of files to fewer than level0_file_num_compaction_trigger.
-    if (c == nullptr) {
-      unsigned int num_files = current_->files_[level].size() -
-                               options_->level0_file_num_compaction_trigger;
-      c = PickCompactionUniversalReadAmp(level, score, UINT_MAX, num_files);
-    }
-  }
-  if (c == nullptr) {
-    return nullptr;
-  }
-  assert(c->inputs_[0].size() > 1);
-
-  // validate that all the chosen files are non overlapping in time
-  FileMetaData* newerfile __attribute__((unused)) = nullptr;
-  for (unsigned int i = 0; i < c->inputs_[0].size(); i++) {
-    FileMetaData* f = c->inputs_[0][i];
-    assert (f->smallest_seqno <= f->largest_seqno);
-    assert(newerfile == nullptr ||
-           newerfile->smallest_seqno > f->largest_seqno);
-    newerfile = f;
-  }
-
-  // The files are sorted from newest first to oldest last.
-  std::vector<int>& file_by_time = c->input_version_->files_by_size_[level];
-
-  // Is the earliest file part of this compaction?
- int last_index = file_by_time[file_by_time.size()-1]; - FileMetaData* last_file = c->input_version_->files_[level][last_index]; - if (c->inputs_[0][c->inputs_[0].size()-1] == last_file) { - c->bottommost_level_ = true; - } - - // update statistics - if (options_->statistics != nullptr) { - options_->statistics->measureTime(NUM_FILES_IN_SINGLE_COMPACTION, - c->inputs_[0].size()); - } - - // mark all the files that are being compacted - c->MarkFilesBeingCompacted(true); - - // remember this currently undergoing compaction - compactions_in_progress_[level].insert(c); - - // Record whether this compaction includes all sst files. - // For now, it is only relevant in universal compaction mode. - c->is_full_compaction_ = - (c->inputs_[0].size() == c->input_version_->files_[0].size()); - - return c; -} - -Compaction* VersionSet::PickCompactionBySize(int level, double score) { - Compaction* c = nullptr; - - // level 0 files are overlapping. So we cannot pick more - // than one concurrent compactions at this level. This - // could be made better by looking at key-ranges that are - // being compacted at level 0. - if (level == 0 && compactions_in_progress_[level].size() == 1) { - return nullptr; - } - - assert(level >= 0); - assert(level + 1 < current_->NumberLevels()); - c = new Compaction(current_, level, level + 1, MaxFileSizeForLevel(level + 1), - MaxGrandParentOverlapBytes(level)); - c->score_ = score; - - // Pick the largest file in this level that is not already - // being compacted - std::vector& file_size = c->input_version_->files_by_size_[level]; - - // record the first file that is not yet compacted - int nextIndex = -1; - - for (unsigned int i = c->input_version_->next_file_to_compact_by_size_[level]; - i < file_size.size(); i++) { - int index = file_size[i]; - FileMetaData* f = c->input_version_->files_[level][index]; - - // check to verify files are arranged in descending size - assert((i == file_size.size() - 1) || - (i >= Version::number_of_files_to_sort_ - 1) || - (f->file_size >= - c->input_version_->files_[level][file_size[i + 1]]->file_size)); - - // do not pick a file to compact if it is being compacted - // from n-1 level. - if (f->being_compacted) { - continue; - } - - // remember the startIndex for the next call to PickCompaction - if (nextIndex == -1) { - nextIndex = i; - } - - //if (i > Version::number_of_files_to_sort_) { - // Log(options_->info_log, "XXX Looking at index %d", i); - //} - - // Do not pick this file if its parents at level+1 are being compacted. - // Maybe we can avoid redoing this work in SetupOtherInputs - int parent_index = -1; - if (ParentRangeInCompaction(&f->smallest, &f->largest, level, - &parent_index)) { - continue; - } - c->inputs_[0].push_back(f); - c->base_index_ = index; - c->parent_index_ = parent_index; - break; - } - - if (c->inputs_[0].empty()) { - delete c; - c = nullptr; - } - - // store where to start the iteration in the next call to PickCompaction - c->input_version_->next_file_to_compact_by_size_[level] = nextIndex; - - return c; -} - -Compaction* VersionSet::PickCompaction() { - Compaction* c = nullptr; - int level = -1; - - // Compute the compactions needed. It is better to do it here - // and also in LogAndApply(), otherwise the values could be stale. - std::vector size_being_compacted(NumberLevels()-1); - current_->vset_->SizeBeingCompacted(size_being_compacted); - current_->Finalize(size_being_compacted); - - // In universal style of compaction, compact L0 files back into L0. 
- if (options_->compaction_style == kCompactionStyleUniversal) { - int level = 0; - c = PickCompactionUniversal(level, current_->compaction_score_[level]); - return c; - } - - // We prefer compactions triggered by too much data in a level over - // the compactions triggered by seeks. - // - // Find the compactions by size on all levels. - for (int i = 0; i < NumberLevels()-1; i++) { - assert(i == 0 || current_->compaction_score_[i] <= - current_->compaction_score_[i-1]); - level = current_->compaction_level_[i]; - if ((current_->compaction_score_[i] >= 1)) { - c = PickCompactionBySize(level, current_->compaction_score_[i]); - ExpandWhileOverlapping(c); - if (c != nullptr) { - break; - } - } - } - - // Find compactions needed by seeks - FileMetaData* f = current_->file_to_compact_; - if (c == nullptr && f != nullptr && !f->being_compacted) { - - level = current_->file_to_compact_level_; - int parent_index = -1; - - // Only allow one level 0 compaction at a time. - // Do not pick this file if its parents at level+1 are being compacted. - if (level != 0 || compactions_in_progress_[0].empty()) { - if(!ParentRangeInCompaction(&f->smallest, &f->largest, level, - &parent_index)) { - c = new Compaction(current_, level, level + 1, - MaxFileSizeForLevel(level + 1), - MaxGrandParentOverlapBytes(level), true); - c->inputs_[0].push_back(f); - c->parent_index_ = parent_index; - c->input_version_->file_to_compact_ = nullptr; - ExpandWhileOverlapping(c); - } - } - } - - if (c == nullptr) { - return nullptr; - } - - // Two level 0 compaction won't run at the same time, so don't need to worry - // about files on level 0 being compacted. - if (level == 0) { - assert(compactions_in_progress_[0].empty()); - InternalKey smallest, largest; - GetRange(c->inputs_[0], &smallest, &largest); - // Note that the next call will discard the file we placed in - // c->inputs_[0] earlier and replace it with an overlapping set - // which will include the picked file. - c->inputs_[0].clear(); - c->input_version_->GetOverlappingInputs(0, &smallest, &largest, - &c->inputs_[0]); - - // If we include more L0 files in the same compaction run it can - // cause the 'smallest' and 'largest' key to get extended to a - // larger range. 
So, re-invoke GetRange to get the new key range - GetRange(c->inputs_[0], &smallest, &largest); - if (ParentRangeInCompaction(&smallest, &largest, - level, &c->parent_index_)) { - delete c; - return nullptr; - } - assert(!c->inputs_[0].empty()); - } - - // Setup "level+1" files (inputs_[1]) - SetupOtherInputs(c); - - // mark all the files that are being compacted - c->MarkFilesBeingCompacted(true); - - // Is this compaction creating a file at the bottommost level - c->SetupBottomMostLevel(false); - - // remember this currently undergoing compaction - compactions_in_progress_[level].insert(c); - - return c; -} - -// Returns true if any one of the parent files are being compacted -bool VersionSet::ParentRangeInCompaction(const InternalKey* smallest, - const InternalKey* largest, int level, - int* parent_index) { - std::vector inputs; - assert(level + 1 < current_->NumberLevels()); - - current_->GetOverlappingInputs(level + 1, smallest, largest, &inputs, - *parent_index, parent_index); - return FilesInCompaction(inputs); -} - -// Returns true if any one of specified files are being compacted -bool VersionSet::FilesInCompaction(std::vector& files) { - for (unsigned int i = 0; i < files.size(); i++) { - if (files[i]->being_compacted) { - return true; - } - } - return false; -} - -// Add more files to the inputs on "level" to make sure that -// no newer version of a key is compacted to "level+1" while leaving an older -// version in a "level". Otherwise, any Get() will search "level" first, -// and will likely return an old/stale value for the key, since it always -// searches in increasing order of level to find the value. This could -// also scramble the order of merge operands. This function should be -// called any time a new Compaction is created, and its inputs_[0] are -// populated. -// -// Will set c to nullptr if it is impossible to apply this compaction. -void VersionSet::ExpandWhileOverlapping(Compaction* c) { - // If inputs are empty then there is nothing to expand. - if (!c || c->inputs_[0].empty()) { - return; - } - - // GetOverlappingInputs will always do the right thing for level-0. - // So we don't need to do any expansion if level == 0. - if (c->level() == 0) { - return; - } - - const int level = c->level(); - InternalKey smallest, largest; - - // Keep expanding c->inputs_[0] until we are sure that there is a - // "clean cut" boundary between the files in input and the surrounding files. - // This will ensure that no parts of a key are lost during compaction. - int hint_index = -1; - size_t old_size; - do { - old_size = c->inputs_[0].size(); - GetRange(c->inputs_[0], &smallest, &largest); - c->inputs_[0].clear(); - c->input_version_->GetOverlappingInputs( - level, &smallest, &largest, &c->inputs_[0], hint_index, &hint_index); - } while(c->inputs_[0].size() > old_size); - - // Get the new range - GetRange(c->inputs_[0], &smallest, &largest); - - // If, after the expansion, there are files that are already under - // compaction, then we must drop/cancel this compaction. - int parent_index = -1; - if (FilesInCompaction(c->inputs_[0]) || - (c->level() != c->output_level() && - ParentRangeInCompaction(&smallest, &largest, level, &parent_index))) { - c->inputs_[0].clear(); - c->inputs_[1].clear(); - delete c; - c = nullptr; - } -} - -// Populates the set of inputs from "level+1" that overlap with "level". 
-// Will also attempt to expand "level" if that doesn't expand "level+1" -// or cause "level" to include a file for compaction that has an overlapping -// user-key with another file. -void VersionSet::SetupOtherInputs(Compaction* c) { - // If inputs are empty, then there is nothing to expand. - // If both input and output levels are the same, no need to consider - // files at level "level+1" - if (c->inputs_[0].empty() || c->level() == c->output_level()) { - return; - } - - const int level = c->level(); - InternalKey smallest, largest; - - // Get the range one last time. - GetRange(c->inputs_[0], &smallest, &largest); - - // Populate the set of next-level files (inputs_[1]) to include in compaction - c->input_version_->GetOverlappingInputs(level + 1, &smallest, &largest, - &c->inputs_[1], c->parent_index_, - &c->parent_index_); - - // Get entire range covered by compaction - InternalKey all_start, all_limit; - GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); - - // See if we can further grow the number of inputs in "level" without - // changing the number of "level+1" files we pick up. We also choose NOT - // to expand if this would cause "level" to include some entries for some - // user key, while excluding other entries for the same user key. This - // can happen when one user key spans multiple files. - if (!c->inputs_[1].empty()) { - std::vector expanded0; - c->input_version_->GetOverlappingInputs( - level, &all_start, &all_limit, &expanded0, c->base_index_, nullptr); - const uint64_t inputs0_size = TotalFileSize(c->inputs_[0]); - const uint64_t inputs1_size = TotalFileSize(c->inputs_[1]); - const uint64_t expanded0_size = TotalFileSize(expanded0); - uint64_t limit = ExpandedCompactionByteSizeLimit(level); - if (expanded0.size() > c->inputs_[0].size() && - inputs1_size + expanded0_size < limit && - !FilesInCompaction(expanded0) && - !c->input_version_->HasOverlappingUserKey(&expanded0, level)) { - InternalKey new_start, new_limit; - GetRange(expanded0, &new_start, &new_limit); - std::vector expanded1; - c->input_version_->GetOverlappingInputs(level + 1, &new_start, &new_limit, - &expanded1, c->parent_index_, - &c->parent_index_); - if (expanded1.size() == c->inputs_[1].size() && - !FilesInCompaction(expanded1)) { - Log(options_->info_log, - "Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu bytes)" - "\n", - (unsigned long)level, - (unsigned long)(c->inputs_[0].size()), - (unsigned long)(c->inputs_[1].size()), - (unsigned long)inputs0_size, - (unsigned long)inputs1_size, - (unsigned long)(expanded0.size()), - (unsigned long)(expanded1.size()), - (unsigned long)expanded0_size, - (unsigned long)inputs1_size); - smallest = new_start; - largest = new_limit; - c->inputs_[0] = expanded0; - c->inputs_[1] = expanded1; - GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); - } - } - } - - // Compute the set of grandparent files that overlap this compaction - // (parent == level+1; grandparent == level+2) - if (level + 2 < NumberLevels()) { - c->input_version_->GetOverlappingInputs(level + 2, &all_start, &all_limit, - &c->grandparents_); - } - - if (false) { - Log(options_->info_log, "Compacting %d '%s' .. '%s'", - level, - smallest.DebugString().c_str(), - largest.DebugString().c_str()); - } - - // Update the place where we will do the next compaction for this level. - // We update this immediately instead of waiting for the VersionEdit - // to be applied so that if the compaction fails, we will try a different - // key range next time. 
- compact_pointer_[level] = largest.Encode().ToString(); - c->edit_->SetCompactPointer(level, largest); + compaction_picker_->ReleaseCompactionFiles(c, status); } Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, @@ -2890,69 +2139,4 @@ void VersionSet::GetObsoleteFiles(std::vector* files) { obsolete_files_.clear(); } -Compaction* VersionSet::CompactRange(int input_level, - int output_level, - const InternalKey* begin, - const InternalKey* end, - InternalKey** compaction_end) { - std::vector inputs; - bool covering_the_whole_range = true; - - // All files are 'overlapping' in universal style compaction. - // We have to compact the entire range in one shot. - if (options_->compaction_style == kCompactionStyleUniversal) { - begin = nullptr; - end = nullptr; - } - current_->GetOverlappingInputs(input_level, begin, end, &inputs); - if (inputs.empty()) { - return nullptr; - } - - // Avoid compacting too much in one shot in case the range is large. - // But we cannot do this for level-0 since level-0 files can overlap - // and we must not pick one file and drop another older file if the - // two files overlap. - if (input_level > 0) { - const uint64_t limit = - MaxFileSizeForLevel(input_level) * options_->source_compaction_factor; - uint64_t total = 0; - for (size_t i = 0; i + 1 < inputs.size(); ++i) { - uint64_t s = inputs[i]->file_size; - total += s; - if (total >= limit) { - **compaction_end = inputs[i + 1]->smallest; - covering_the_whole_range = false; - inputs.resize(i + 1); - break; - } - } - } - Compaction* c = new Compaction(current_, input_level, output_level, - MaxFileSizeForLevel(output_level), - MaxGrandParentOverlapBytes(input_level)); - - c->inputs_[0] = inputs; - ExpandWhileOverlapping(c); - if (c == nullptr) { - Log(options_->info_log, "Could not compact due to expansion failure.\n"); - return nullptr; - } - - SetupOtherInputs(c); - - if (covering_the_whole_range) { - *compaction_end = nullptr; - } - - // These files that are to be manaully compacted do not trample - // upon other files because manual compactions are processed when - // the system has a max of 1 background compaction thread. - c->MarkFilesBeingCompacted(true); - - // Is this compaction creating a file at the bottommost level - c->SetupBottomMostLevel(true); - return c; -} - } // namespace rocksdb diff --git a/db/version_set.h b/db/version_set.h index 8651a6eb3..a2a033676 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -28,12 +28,14 @@ #include "port/port.h" #include "db/table_cache.h" #include "db/compaction.h" +#include "db/compaction_picker.h" namespace rocksdb { namespace log { class Writer; } class Compaction; +class CompactionPicker; class Iterator; class MemTable; class TableCache; @@ -185,6 +187,9 @@ class Version { friend class Compaction; friend class VersionSet; friend class DBImpl; + friend class CompactionPicker; + friend class LevelCompactionPicker; + friend class UniversalCompactionPicker; class LevelFileNumIterator; Iterator* NewConcatenatingIterator(const ReadOptions&, @@ -407,35 +412,18 @@ class VersionSet { // Return the size of the current manifest file uint64_t ManifestFileSize() const { return manifest_file_size_; } - // For the specfied level, pick a compaction. - // Returns nullptr if there is no compaction to be done. - // If level is 0 and there is already a compaction on that level, this - // function will return nullptr. 
- Compaction* PickCompactionBySize(int level, double score); - - // Pick files to compact in Universal mode - Compaction* PickCompactionUniversal(int level, double score); - - // Pick Universal compaction to limit read amplification - Compaction* PickCompactionUniversalReadAmp(int level, double score, - unsigned int ratio, unsigned int num_files); - - // Pick Universal compaction to limit space amplification. - Compaction* PickCompactionUniversalSizeAmp(int level, double score); - - // Free up the files that were participated in a compaction - void ReleaseCompactionFiles(Compaction* c, Status status); - // verify that the files that we started with for a compaction // still exist in the current version and in the same original level. // This ensures that a concurrent compaction did not erroneously // pick the same files to compact. bool VerifyCompactionFileConsistency(Compaction* c); + double MaxBytesForLevel(int level); + // Get the max file size in a given level. uint64_t MaxFileSizeForLevel(int level); - double MaxBytesForLevel(int level); + void ReleaseCompactionFiles(Compaction* c, Status status); Status GetMetadataForFile( uint64_t number, int *filelevel, FileMetaData *metadata); @@ -452,21 +440,6 @@ class VersionSet { friend class Compaction; friend class Version; - void Init(int num_levels); - - void GetRange(const std::vector& inputs, - InternalKey* smallest, - InternalKey* largest); - - void GetRange2(const std::vector& inputs1, - const std::vector& inputs2, - InternalKey* smallest, - InternalKey* largest); - - void ExpandWhileOverlapping(Compaction* c); - - void SetupOtherInputs(Compaction* c); - // Save current contents to *log Status WriteSnapshot(log::Writer* log); @@ -474,10 +447,6 @@ class VersionSet { bool ManifestContains(const std::string& record) const; - uint64_t ExpandedCompactionByteSizeLimit(int level); - - uint64_t MaxGrandParentOverlapBytes(int level); - Env* const env_; const std::string dbname_; const Options* const options_; @@ -504,14 +473,9 @@ class VersionSet { // Either an empty string, or a valid InternalKey. std::string* compact_pointer_; - // Per-level target file size. 
- uint64_t* max_file_size_; - - // Per-level max bytes - uint64_t* level_max_bytes_; - - // record all the ongoing compactions for all levels - std::vector > compactions_in_progress_; + // An object that keeps all the compaction stats + // and picks the next compaction + std::unique_ptr compaction_picker_; // generates a increasing version number for every new version uint64_t current_version_number_; @@ -535,17 +499,6 @@ class VersionSet { VersionSet(const VersionSet&); void operator=(const VersionSet&); - // Return the total amount of data that is undergoing - // compactions per level - void SizeBeingCompacted(std::vector&); - - // Returns true if any one of the parent files are being compacted - bool ParentRangeInCompaction(const InternalKey* smallest, - const InternalKey* largest, int level, int* index); - - // Returns true if any one of the specified files are being compacted - bool FilesInCompaction(std::vector& files); - void LogAndApplyHelper(Builder*b, Version* v, VersionEdit* edit, port::Mutex* mu); }; diff --git a/db/version_set_reduce_num_levels.cc b/db/version_set_reduce_num_levels.cc index 2ca689809..c081d2c58 100644 --- a/db/version_set_reduce_num_levels.cc +++ b/db/version_set_reduce_num_levels.cc @@ -68,11 +68,9 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { current_version->num_levels_ = new_levels; delete[] compact_pointer_; - delete[] max_file_size_; - delete[] level_max_bytes_; num_levels_ = new_levels; compact_pointer_ = new std::string[new_levels]; - Init(new_levels); + compaction_picker_->ReduceNumberOfLevels(new_levels); VersionEdit ve; st = LogAndApply(&ve, mu, true); return st; From 6d6fb7096026b246ae9751da08781be1f50e52b6 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 16 Jan 2014 14:06:53 -0800 Subject: [PATCH 22/27] Remove compaction pointers Summary: The only thing we do with compaction pointers is set them to some values, we never actually read them. I don't know what we used them for, but it doesn't look like we use them anymore. 
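For illustration, a minimal, self-contained sketch of the backward-compatibility pattern this change leans on. Every name below is hypothetical (this is not the RocksDB VersionEdit API); only the idea mirrors the patch: a tag-based decoder keeps parsing the now-unused compact-pointer record so that MANIFEST files written by older releases still load, and simply discards the value.

    // Sketch only: hypothetical tags and record layout, not RocksDB's.
    #include <cstdint>
    #include <iostream>
    #include <vector>

    enum Tag : uint32_t { kComparator = 1, kCompactPointer = 2, kDeletedFile = 3 };

    struct Record {
      Tag tag;
      uint32_t payload;  // stands in for the varint/key fields of a real edit
    };

    // Returns false only on an unknown tag. A known-but-obsolete tag is
    // consumed and dropped, which is what keeps old manifests readable.
    bool Decode(const std::vector<Record>& records) {
      for (const Record& r : records) {
        switch (r.tag) {
          case kComparator:
            std::cout << "comparator id " << r.payload << "\n";
            break;
          case kCompactPointer:
            // Parsed for backward compatibility, intentionally ignored.
            break;
          case kDeletedFile:
            std::cout << "deleted file " << r.payload << "\n";
            break;
          default:
            return false;  // genuinely unknown record: corruption
        }
      }
      return true;
    }

    int main() {
      // A "legacy" stream that still carries a compact-pointer record.
      std::vector<Record> legacy = {
          {kComparator, 7}, {kCompactPointer, 4}, {kDeletedFile, 42}};
      std::cout << (Decode(legacy) ? "ok" : "corrupt") << "\n";
      return 0;
    }

The diff below does the same thing in DecodeFrom(): the kCompactPointer case still consumes the level and the key, but no longer stores them anywhere.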
Test Plan: make check Reviewers: dhruba, haobo, kailiu, sdong Reviewed By: kailiu CC: leveldb Differential Revision: https://reviews.facebook.net/D15225 --- db/version_edit.cc | 16 +++------------- db/version_edit.h | 4 ---- db/version_edit_test.cc | 1 - db/version_set.cc | 18 ------------------ db/version_set.h | 4 ---- db/version_set_reduce_num_levels.cc | 2 -- 6 files changed, 3 insertions(+), 42 deletions(-) diff --git a/db/version_edit.cc b/db/version_edit.cc index 42c07e7b0..70618c9ce 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -69,12 +69,6 @@ void VersionEdit::EncodeTo(std::string* dst) const { PutVarint64(dst, last_sequence_); } - for (size_t i = 0; i < compact_pointers_.size(); i++) { - PutVarint32(dst, kCompactPointer); - PutVarint32(dst, compact_pointers_[i].first); // level - PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode()); - } - for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); iter != deleted_files_.end(); ++iter) { @@ -178,7 +172,9 @@ Status VersionEdit::DecodeFrom(const Slice& src) { case kCompactPointer: if (GetLevel(&input, &level, &msg) && GetInternalKey(&input, &key)) { - compact_pointers_.push_back(std::make_pair(level, key)); + // we don't use compact pointers anymore, + // but we should not fail if they are still + // in manifest } else { if (!msg) { msg = "compaction pointer"; @@ -267,12 +263,6 @@ std::string VersionEdit::DebugString(bool hex_key) const { r.append("\n LastSeq: "); AppendNumberTo(&r, last_sequence_); } - for (size_t i = 0; i < compact_pointers_.size(); i++) { - r.append("\n CompactPointer: "); - AppendNumberTo(&r, compact_pointers_[i].first); - r.append(" "); - r.append(compact_pointers_[i].second.DebugString(hex_key)); - } for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); iter != deleted_files_.end(); ++iter) { diff --git a/db/version_edit.h b/db/version_edit.h index a0546c983..e6fe15773 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -59,9 +59,6 @@ class VersionEdit { has_last_sequence_ = true; last_sequence_ = seq; } - void SetCompactPointer(int level, const InternalKey& key) { - compact_pointers_.push_back(std::make_pair(level, key)); - } // Add the specified file at the specified number. 
// REQUIRES: This version has not been saved (see VersionSet::SaveTo) @@ -117,7 +114,6 @@ class VersionEdit { bool has_next_file_number_; bool has_last_sequence_; - std::vector > compact_pointers_; DeletedFileSet deleted_files_; std::vector > new_files_; }; diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 63aa32e8f..110b422f8 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -36,7 +36,6 @@ TEST(VersionEditTest, EncodeDecode) { kBig + 500 + i, kBig + 600 + i); edit.DeleteFile(4, kBig + 700 + i); - edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue)); } edit.SetComparatorName("foo"); diff --git a/db/version_set.cc b/db/version_set.cc index af0883d70..ebd2805bc 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1245,13 +1245,6 @@ class VersionSet::Builder { void Apply(VersionEdit* edit) { CheckConsistency(base_); - // Update compaction pointers - for (size_t i = 0; i < edit->compact_pointers_.size(); i++) { - const int level = edit->compact_pointers_[i].first; - vset_->compact_pointer_[level] = - edit->compact_pointers_[i].second.Encode().ToString(); - } - // Delete files const VersionEdit::DeletedFileSet& del = edit->deleted_files_; for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin(); @@ -1365,7 +1358,6 @@ VersionSet::VersionSet(const std::string& dbname, const Options* options, manifest_file_size_(0), storage_options_(storage_options), storage_options_compactions_(storage_options_) { - compact_pointer_ = new std::string[options_->num_levels]; if (options_->compaction_style == kCompactionStyleUniversal) { compaction_picker_.reset(new UniversalCompactionPicker(options_, &icmp_)); } else { @@ -1381,7 +1373,6 @@ VersionSet::~VersionSet() { delete file; } obsolete_files_.clear(); - delete[] compact_pointer_; } void VersionSet::AppendVersion(Version* v) { @@ -1881,15 +1872,6 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { VersionEdit edit; edit.SetComparatorName(icmp_.user_comparator()->Name()); - // Save compaction pointers - for (int level = 0; level < NumberLevels(); level++) { - if (!compact_pointer_[level].empty()) { - InternalKey key; - key.DecodeFrom(compact_pointer_[level]); - edit.SetCompactPointer(level, key); - } - } - // Save files for (int level = 0; level < current_->NumberLevels(); level++) { const std::vector& files = current_->files_[level]; diff --git a/db/version_set.h b/db/version_set.h index a2a033676..1d92629f1 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -469,10 +469,6 @@ class VersionSet { // we have too many level 0 files bool need_slowdown_for_num_level0_files_; - // Per-level key at which the next compaction at that level should start. - // Either an empty string, or a valid InternalKey. 
- std::string* compact_pointer_; - // An object that keeps all the compaction stats // and picks the next compaction std::unique_ptr compaction_picker_; diff --git a/db/version_set_reduce_num_levels.cc b/db/version_set_reduce_num_levels.cc index c081d2c58..68b84dab1 100644 --- a/db/version_set_reduce_num_levels.cc +++ b/db/version_set_reduce_num_levels.cc @@ -67,9 +67,7 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { current_version->files_ = new_files_list; current_version->num_levels_ = new_levels; - delete[] compact_pointer_; num_levels_ = new_levels; - compact_pointer_ = new std::string[new_levels]; compaction_picker_->ReduceNumberOfLevels(new_levels); VersionEdit ve; st = LogAndApply(&ve, mu, true); From e19bad9bbd333b04f37f29095d784c87488763a3 Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Thu, 16 Jan 2014 14:26:51 -0800 Subject: [PATCH 23/27] Fix some "make format" issue Summary: * make sure when some pre-check fails, the script won't halt immediately. * change fburl to google's short url. * Fix a bug in this script: now it checks the uncommitted code only. Test Plan: Ran the script under differnet environments. Reviewers: igor Reviewed By: igor CC: leveldb Differential Revision: https://reviews.facebook.net/D15231 --- build_tools/format-diff.sh | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh index 758135c9f..ceae38192 100755 --- a/build_tools/format-diff.sh +++ b/build_tools/format-diff.sh @@ -1,5 +1,4 @@ #!/bin/bash -set -e # If clang_format_diff.py command is not specfied, we assume we are able to # access directly without any path. if [ -z $CLANG_FORMAT_DIFF ] @@ -12,7 +11,7 @@ if ! which $CLANG_FORMAT_DIFF &> /dev/null then echo "You didn't have clang-format-diff.py available in your computer!" echo "You can download it by running: " - echo " curl https://fburl.com/clang-format-diff" + echo " curl http://goo.gl/iUW1u2" exit 128 fi @@ -49,8 +48,22 @@ fi # fi # fi -# Check the format of recently changed lines, -diffs=$(git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -p 1) +set -e + +uncommitted_code=`git diff HEAD` + +# If there's no uncommitted changes, we assume user are doing post-commit +# format check, in which case we'll check the modified lines from latest commit. +# Otherwise, we'll check format of the uncommitted code only. +format_last_commit=0 +if [ -z "$uncommitted_code" ] +then + # Check the format of last commit + diffs=$(git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -p 1) +else + # Check the format of uncommitted lines, + diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1) +fi if [ -z "$diffs" ] then @@ -81,3 +94,16 @@ fi # Do in-place format adjustment. git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -i -p 1 +echo "Files reformatted!" + +# Amend to last commit if user do the post-commit format check +if [ -z "$uncommitted_code" ]; then + echo -e "Would you like to amend the changes to last commit (`git log HEAD --oneline | head -1`)? (y/n): \c" + read to_amend + + if [ "$to_amend" == "y" ] + then + git commit -a --amend --reuse-message HEAD + echo "Amended to last commit" + fi +fi From 439e36db21ca3ac637fce484c39601f402f1056c Mon Sep 17 00:00:00 2001 From: Mark Callaghan Date: Thu, 16 Jan 2014 18:44:23 -0800 Subject: [PATCH 24/27] Fix SlowdownAmount Summary: This had a few bugs. 1) bottom and top were reversed. top is for the max value but the callers were passing the max value to bottom. The result is that the max sleep is used when n >= bottom. 
2) one of the callers passed values with type double and these values are frequently between 1.0 and 2.0, so rounding will do some bad things 3) sometimes the function returned 0 when there should be a stall. With this change and one other diff (out for review soon) there are slightly fewer stalls on one workload. With the fix. Stalls(secs): 160.166 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 58.495 leveln_slowdown Stalls(count): 910261 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 54526 leveln_slowdown Without the fix. Stalls(secs): 172.227 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 56.538 leveln_slowdown Stalls(count): 160831 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 52845 leveln_slowdown Task ID: # Blame Rev: Test Plan: run db_bench for --benchmarks=overwrite with IO-bound database Revert Plan: Database Impact: Memcache Impact: Other Notes: EImportant: - begin *PUBLIC* platform impact section - Bugzilla: # - end platform impact - Reviewers: haobo Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D15243 --- db/db_impl.cc | 8 ++++---- db/db_impl.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index e84817b9b..0cafc269c 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3155,7 +3155,7 @@ void DBImpl::BuildBatchGroup(Writer** last_writer, // The goal of this formula is to gradually increase the rate at which writes // are slowed. We also tried linear delay (r * 1000), but it seemed to do // slightly worse. There is no other particular reason for choosing quadratic. -uint64_t DBImpl::SlowdownAmount(int n, int top, int bottom) { +uint64_t DBImpl::SlowdownAmount(int n, double bottom, double top) { uint64_t delay; if (n >= top) { delay = 1000; @@ -3167,10 +3167,10 @@ uint64_t DBImpl::SlowdownAmount(int n, int top, int bottom) { // If we are here, we know that: // level0_start_slowdown <= n < level0_slowdown // since the previous two conditions are false. - float how_much = - (float) (n - bottom) / + double how_much = + (double) (n - bottom) / (top - bottom); - delay = how_much * how_much * 1000; + delay = std::max(how_much * how_much * 1000, 100.0); } assert(delay <= 1000); return delay; diff --git a/db/db_impl.h b/db/db_impl.h index 214affac7..3eebaf4a7 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -293,7 +293,7 @@ class DBImpl : public DB { Status WriteLevel0Table(std::vector &mems, VersionEdit* edit, uint64_t* filenumber); - uint64_t SlowdownAmount(int n, int top, int bottom); + uint64_t SlowdownAmount(int n, double bottom, double top); // MakeRoomForWrite will return superversion_to_free through an arugment, // which the caller needs to delete. We do it because caller can delete // the superversion outside of mutex From 0f4a75b710d616b08a06321ae29979d05cbf1f76 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 17 Jan 2014 12:02:03 -0800 Subject: [PATCH 25/27] Fix SIGSEGV in compaction picker Summary: The SIGSEGV was introduced by https://reviews.facebook.net/D15171 I also fixed ExpandWhileOverlapping() which returned the failure by setting its own stack variable to nullptr (!). This bug is present in 2.6 release, so I guess ExpandWhileOverlapping never fails :) Test Plan: `make check`. Also MarkCallaghan confirmed it fixed the SIGSEGV he reported.
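To make the stack-variable remark concrete, here is a minimal, self-contained sketch (all names hypothetical, not the actual picker code) of why assigning nullptr to a pointer parameter is invisible to the caller. In the buggy code the function had also deleted the compaction before nulling its local copy, so the caller kept using a dangling pointer, hence the SIGSEGV.

    #include <iostream>

    struct Compaction {};

    // Buggy shape: 'c' is the callee's own copy of the caller's pointer,
    // so this assignment is lost as soon as the function returns. (The
    // real bug also deleted the object first, leaving the caller with a
    // dangling pointer.)
    void ExpandBuggy(Compaction* c) { c = nullptr; }

    // Fixed shape: report failure through the return value and let the
    // caller delete the object.
    bool ExpandFixed(Compaction* /*c*/) { return false; }

    int main() {
      Compaction* c = new Compaction;
      ExpandBuggy(c);
      // Prints the first branch: only the callee's copy was nulled.
      std::cout << (c ? "bug: caller never sees the failure\n"
                      : "caller saw the failure\n");
      if (!ExpandFixed(c)) {  // the fixed protocol
        delete c;
        c = nullptr;
      }
      std::cout << (c ? "leaked\n" : "cleaned up\n");
      return 0;
    }

The same protocol appears in the diff below: ExpandWhileOverlapping() now returns false on failure, and each caller deletes the compaction itself.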
Reviewers: MarkCallaghan, kailiu, sdong, dhruba, haobo Reviewed By: kailiu CC: leveldb Differential Revision: https://reviews.facebook.net/D15261 --- db/compaction_picker.cc | 36 +++++++++++++++--------------------- db/compaction_picker.h | 12 +++++++++++- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index fa2fbc663..8dd3c03bf 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -132,26 +132,16 @@ void CompactionPicker::GetRange(const std::vector& inputs1, GetRange(all, smallest, largest); } -// Add more files to the inputs on "level" to make sure that -// no newer version of a key is compacted to "level+1" while leaving an older -// version in a "level". Otherwise, any Get() will search "level" first, -// and will likely return an old/stale value for the key, since it always -// searches in increasing order of level to find the value. This could -// also scramble the order of merge operands. This function should be -// called any time a new Compaction is created, and its inputs_[0] are -// populated. -// -// Will set c to nullptr if it is impossible to apply this compaction. -void CompactionPicker::ExpandWhileOverlapping(Compaction* c) { +bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) { // If inputs are empty then there is nothing to expand. if (!c || c->inputs_[0].empty()) { - return; + return true; } // GetOverlappingInputs will always do the right thing for level-0. // So we don't need to do any expansion if level == 0. if (c->level() == 0) { - return; + return true; } const int level = c->level(); @@ -182,9 +172,9 @@ void CompactionPicker::ExpandWhileOverlapping(Compaction* c) { &parent_index))) { c->inputs_[0].clear(); c->inputs_[1].clear(); - delete c; - c = nullptr; + return false; } + return true; } uint64_t CompactionPicker::ExpandedCompactionByteSizeLimit(int level) { @@ -341,8 +331,8 @@ Compaction* CompactionPicker::CompactRange(Version* version, int input_level, MaxGrandParentOverlapBytes(input_level)); c->inputs_[0] = inputs; - ExpandWhileOverlapping(c); - if (c == nullptr) { + if (ExpandWhileOverlapping(c) == false) { + delete c; Log(options_->info_log, "Could not compact due to expansion failure.\n"); return nullptr; } @@ -383,8 +373,10 @@ Compaction* LevelCompactionPicker::PickCompaction(Version* version) { level = version->compaction_level_[i]; if ((version->compaction_score_[i] >= 1)) { c = PickCompactionBySize(version, level, version->compaction_score_[i]); - ExpandWhileOverlapping(c); - if (c != nullptr) { + if (ExpandWhileOverlapping(c) == false) { + delete c; + c = nullptr; + } else { break; } } @@ -408,7 +400,9 @@ Compaction* LevelCompactionPicker::PickCompaction(Version* version) { c->inputs_[0].push_back(f); c->parent_index_ = parent_index; c->input_version_->file_to_compact_ = nullptr; - ExpandWhileOverlapping(c); + if (ExpandWhileOverlapping(c) == false) { + return nullptr; + } } } } @@ -528,7 +522,7 @@ Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version, } // store where to start the iteration in the next call to PickCompaction - c->input_version_->next_file_to_compact_by_size_[level] = nextIndex; + version->next_file_to_compact_by_size_[level] = nextIndex; return c; } diff --git a/db/compaction_picker.h b/db/compaction_picker.h index 980c60013..0fe086a18 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -85,7 +85,17 @@ class CompactionPicker { const std::vector& inputs2, InternalKey* smallest, InternalKey* largest); - void 
ExpandWhileOverlapping(Compaction* c); + // Add more files to the inputs on "level" to make sure that + // no newer version of a key is compacted to "level+1" while leaving an older + // version in a "level". Otherwise, any Get() will search "level" first, + // and will likely return an old/stale value for the key, since it always + // searches in increasing order of level to find the value. This could + // also scramble the order of merge operands. This function should be + // called any time a new Compaction is created, and its inputs_[0] are + // populated. + // + // Will return false if it is impossible to apply this compaction. + bool ExpandWhileOverlapping(Compaction* c); uint64_t ExpandedCompactionByteSizeLimit(int level); From 83681bf9efa6b2d15c3d3115f1127a96bc932ce7 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 17 Jan 2014 12:46:06 -0800 Subject: [PATCH 26/27] Statistics code cleanup Summary: I'm separating code-cleanup part of https://reviews.facebook.net/D14517. This will make D14517 easier to understand and this diff easier to review. Test Plan: make check Reviewers: haobo, kailiu, sdong, dhruba, tnovak Reviewed By: tnovak CC: leveldb Differential Revision: https://reviews.facebook.net/D15099 --- db/compaction_picker.cc | 7 +- db/db_bench.cc | 9 +- db/db_impl.cc | 8 +- db/db_statistics.cc | 14 ---- db/db_statistics.h | 63 -------------- db/db_test.cc | 155 ++++++++++++++--------------------- db/memtable.cc | 2 +- db/merge_helper.cc | 2 +- db/simple_table_db_test.cc | 2 +- db/write_batch.cc | 2 +- include/rocksdb/statistics.h | 45 +--------- table/table_test.cc | 21 ++--- tools/db_stress.cc | 2 +- util/histogram.cc | 66 ++++++++------- util/histogram.h | 18 ++-- util/statistics.cc | 43 ++++++++-- util/statistics.h | 53 ++++++++++++ util/statistics_imp.h | 32 -------- util/stop_watch.h | 8 +- 19 files changed, 225 insertions(+), 327 deletions(-) delete mode 100644 db/db_statistics.cc delete mode 100644 db/db_statistics.h create mode 100644 util/statistics.h delete mode 100644 util/statistics_imp.h diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 8dd3c03bf..cfa3770d7 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -8,6 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/compaction_picker.h" +#include "util/statistics.h" namespace rocksdb { @@ -589,10 +590,8 @@ Compaction* UniversalCompactionPicker::PickCompaction(Version* version) { } // update statistics - if (options_->statistics != nullptr) { - options_->statistics->measureTime(NUM_FILES_IN_SINGLE_COMPACTION, - c->inputs_[0].size()); - } + MeasureTime(options_->statistics.get(), NUM_FILES_IN_SINGLE_COMPACTION, + c->inputs_[0].size()); // mark all the files that are being compacted c->MarkFilesBeingCompacted(true); diff --git a/db/db_bench.cc b/db/db_bench.cc index e0ba58281..e41a31cf3 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -14,7 +14,7 @@ #include #include "db/db_impl.h" #include "db/version_set.h" -#include "db/db_statistics.h" +#include "rocksdb/statistics.h" #include "rocksdb/options.h" #include "rocksdb/cache.h" #include "rocksdb/db.h" @@ -30,6 +30,7 @@ #include "util/random.h" #include "util/stack_trace.h" #include "util/string_util.h" +#include "util/statistics.h" #include "util/testutil.h" #include "hdfs/env_hdfs.h" #include "utilities/merge_operators.h" @@ -355,9 +356,9 @@ static bool ValidateCompressionLevel(const char* flagname, int32_t value) { return true; } -static const bool FLAGS_compression_level_dummy = - google::RegisterFlagValidator(&FLAGS_compression_level, - &ValidateCompressionLevel); +static const bool FLAGS_compression_level_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_compression_level, + &ValidateCompressionLevel); DEFINE_int32(min_level_to_compress, -1, "If non-negative, compression starts" " from this level. Levels with number < min_level_to_compress are" diff --git a/db/db_impl.cc b/db/db_impl.cc index 0cafc269c..a16f4479e 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2564,9 +2564,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, CompactionStats stats; stats.micros = env_->NowMicros() - start_micros - imm_micros; - if (options_.statistics.get()) { - options_.statistics.get()->measureTime(COMPACTION_TIME, stats.micros); - } + MeasureTime(options_.statistics.get(), COMPACTION_TIME, stats.micros); stats.files_in_leveln = compact->compaction->num_input_files(0); stats.files_in_levelnp1 = compact->compaction->num_input_files(1); @@ -3062,8 +3060,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // have succeeded in memtable but Status reports error for all writes. throw std::runtime_error("In memory WriteBatch corruption!"); } - SetTickerCount(options_.statistics.get(), - SEQUENCE_NUMBER, last_sequence); + SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER, + last_sequence); } if (updates == &tmp_batch_) tmp_batch_.Clear(); mutex_.Lock(); diff --git a/db/db_statistics.cc b/db/db_statistics.cc deleted file mode 100644 index f0cfd6740..000000000 --- a/db/db_statistics.cc +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#include "db/db_statistics.h" - -namespace rocksdb { - -std::shared_ptr CreateDBStatistics() { - return std::make_shared(); -} - -} // namespace rocksdb diff --git a/db/db_statistics.h b/db/db_statistics.h deleted file mode 100644 index ec71e1688..000000000 --- a/db/db_statistics.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
-// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#pragma once -#include -#include -#include -#include - -#include "rocksdb/statistics.h" -#include "util/histogram.h" -#include "port/port.h" -#include "util/mutexlock.h" - - -namespace rocksdb { - -class DBStatistics: public Statistics { - public: - DBStatistics() : allTickers_(TICKER_ENUM_MAX), - allHistograms_(HISTOGRAM_ENUM_MAX) { } - - virtual ~DBStatistics() {} - - virtual long getTickerCount(Tickers tickerType) { - assert(tickerType < TICKER_ENUM_MAX); - return allTickers_[tickerType].getCount(); - } - - virtual void setTickerCount(Tickers tickerType, uint64_t count) { - assert(tickerType < TICKER_ENUM_MAX); - allTickers_[tickerType].setTickerCount(count); - } - - virtual void recordTick(Tickers tickerType, uint64_t count) { - assert(tickerType < TICKER_ENUM_MAX); - allTickers_[tickerType].recordTick(count); - } - - virtual void measureTime(Histograms histogramType, uint64_t value) { - assert(histogramType < HISTOGRAM_ENUM_MAX); - allHistograms_[histogramType].Add(value); - } - - virtual void histogramData(Histograms histogramType, - HistogramData * const data) { - assert(histogramType < HISTOGRAM_ENUM_MAX); - allHistograms_[histogramType].Data(data); - } - - std::vector allTickers_; - std::vector allHistograms_; -}; - -std::shared_ptr CreateDBStatistics(); - -} // namespace rocksdb diff --git a/db/db_test.cc b/db/db_test.cc index 9c8a97f93..133bcb5b4 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -17,7 +17,6 @@ #include "db/filename.h" #include "db/version_set.h" #include "db/write_batch_internal.h" -#include "db/db_statistics.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" @@ -27,6 +26,7 @@ #include "util/mutexlock.h" #include "util/testharness.h" #include "util/testutil.h" +#include "util/statistics.h" #include "utilities/merge_operators.h" namespace rocksdb { @@ -677,6 +677,10 @@ static std::string Key(int i) { return std::string(buf); } +static long TestGetTickerCount(const Options& options, Tickers ticker_type) { + return options.statistics->getTickerCount(ticker_type); +} + TEST(DBTest, Empty) { do { ASSERT_TRUE(db_ != nullptr); @@ -710,14 +714,11 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { dbfull()->Flush(FlushOptions()); // index/filter blocks added to block cache right after table creation. - ASSERT_EQ(1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(2, /* only index/filter were added */ - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); - ASSERT_EQ(0, - options.statistics.get()->getTickerCount(BLOCK_CACHE_DATA_MISS)); + TestGetTickerCount(options, BLOCK_CACHE_ADD)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS)); // Make sure filter block is in cache. 
std::string value; @@ -725,31 +726,24 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { db_->KeyMayExist(ReadOptions(), "key", &value); // Miss count should remain the same. - ASSERT_EQ(1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS)); - ASSERT_EQ(1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); db_->KeyMayExist(ReadOptions(), "key", &value); - ASSERT_EQ(1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS)); - ASSERT_EQ(2, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); // Make sure index block is in cache. - auto index_block_hit = - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT); + auto index_block_hit = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); value = Get("key"); - ASSERT_EQ(1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(index_block_hit + 1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT)); + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); value = Get("key"); - ASSERT_EQ(1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(index_block_hit + 2, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT)); + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); } TEST(DBTest, LevelLimitReopen) { @@ -964,47 +958,39 @@ TEST(DBTest, KeyMayExist) { dbfull()->Flush(FlushOptions()); value.clear(); - long numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); - long cache_added = - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + long numopen = TestGetTickerCount(options, NO_FILE_OPENS); + long cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); ASSERT_TRUE(db_->KeyMayExist(ropts, "a", &value, &value_found)); ASSERT_TRUE(!value_found); // assert that no new files were opened and no new blocks were // read into block cache. 
- ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); - ASSERT_EQ(cache_added, - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); ASSERT_OK(db_->Delete(WriteOptions(), "a")); - numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); - cache_added = - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); - ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); - ASSERT_EQ(cache_added, - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); dbfull()->Flush(FlushOptions()); dbfull()->CompactRange(nullptr, nullptr); - numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); - cache_added = - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); - ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); - ASSERT_EQ(cache_added, - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); ASSERT_OK(db_->Delete(WriteOptions(), "c")); - numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); - cache_added = - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); ASSERT_TRUE(!db_->KeyMayExist(ropts, "c", &value)); - ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); - ASSERT_EQ(cache_added, - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); delete options.filter_policy; } while (ChangeOptions()); @@ -1037,9 +1023,8 @@ TEST(DBTest, NonBlockingIteration) { // verify that a non-blocking iterator does not find any // kvs. Neither does it do any IOs to storage. 
- long numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); - long cache_added = - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + long numopen = TestGetTickerCount(options, NO_FILE_OPENS); + long cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); iter = db_->NewIterator(non_blocking_opts); count = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { @@ -1047,18 +1032,16 @@ TEST(DBTest, NonBlockingIteration) { } ASSERT_EQ(count, 0); ASSERT_TRUE(iter->status().IsIncomplete()); - ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); - ASSERT_EQ(cache_added, - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); delete iter; // read in the specified block via a regular get ASSERT_EQ(Get("a"), "b"); // verify that we can find it via a non-blocking scan - numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); - cache_added = - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); iter = db_->NewIterator(non_blocking_opts); count = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { @@ -1066,9 +1049,8 @@ TEST(DBTest, NonBlockingIteration) { count++; } ASSERT_EQ(count, 1); - ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); - ASSERT_EQ(cache_added, - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); delete iter; } while (ChangeOptions()); @@ -1273,12 +1255,10 @@ TEST(DBTest, IterReseek) { ASSERT_OK(Put("b", "bone")); Iterator* iter = db_->NewIterator(ReadOptions()); iter->SeekToFirst(); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); ASSERT_EQ(IterStatus(iter), "a->two"); iter->Next(); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); ASSERT_EQ(IterStatus(iter), "b->bone"); delete iter; @@ -1289,8 +1269,7 @@ TEST(DBTest, IterReseek) { iter->SeekToFirst(); ASSERT_EQ(IterStatus(iter), "a->three"); iter->Next(); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); ASSERT_EQ(IterStatus(iter), "b->bone"); delete iter; @@ -1300,30 +1279,28 @@ TEST(DBTest, IterReseek) { iter = db_->NewIterator(ReadOptions()); iter->SeekToFirst(); ASSERT_EQ(IterStatus(iter), "a->four"); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); iter->Next(); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); ASSERT_EQ(IterStatus(iter), "b->bone"); delete iter; // Testing reverse iterator // At this point, we have three versions of "a" and one version of "b". // The reseek statistics is already at 1. 
- int num_reseeks = (int)options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION); + int num_reseeks = + (int)TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION); // Insert another version of b and assert that reseek is not invoked ASSERT_OK(Put("b", "btwo")); iter = db_->NewIterator(ReadOptions()); iter->SeekToLast(); ASSERT_EQ(IterStatus(iter), "b->btwo"); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks); iter->Prev(); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks+1); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 1); ASSERT_EQ(IterStatus(iter), "a->four"); delete iter; @@ -1334,13 +1311,13 @@ TEST(DBTest, IterReseek) { iter = db_->NewIterator(ReadOptions()); iter->SeekToLast(); ASSERT_EQ(IterStatus(iter), "b->bfour"); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks + 2); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 2); iter->Prev(); // the previous Prev call should have invoked reseek - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks + 3); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 3); ASSERT_EQ(IterStatus(iter), "a->four"); delete iter; } @@ -2103,24 +2080,18 @@ TEST(DBTest, CompressedCache) { switch (iter) { case 0: // only uncompressed block cache - ASSERT_GT(options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS), - 0); - ASSERT_EQ(options.statistics.get()->getTickerCount - (BLOCK_CACHE_COMPRESSED_MISS), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); break; case 1: // no block cache, only compressed cache - ASSERT_EQ(options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS), - 0); - ASSERT_GT(options.statistics.get()->getTickerCount - (BLOCK_CACHE_COMPRESSED_MISS), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); break; case 2: // both compressed and uncompressed block cache - ASSERT_GT(options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS), - 0); - ASSERT_GT(options.statistics.get()->getTickerCount - (BLOCK_CACHE_COMPRESSED_MISS), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); break; default: ASSERT_TRUE(false); diff --git a/db/memtable.cc b/db/memtable.cc index 7eb4eb165..91f4ed5d6 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -20,7 +20,7 @@ #include "util/coding.h" #include "util/mutexlock.h" #include "util/murmurhash.h" -#include "util/statistics_imp.h" +#include "util/statistics.h" namespace std { template <> diff --git a/db/merge_helper.cc b/db/merge_helper.cc index a7e2df0a3..e3f3adb1f 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -8,7 +8,7 @@ #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/merge_operator.h" -#include "util/statistics_imp.h" +#include "util/statistics.h" #include #include diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc index 555d31893..0f3b89d9b 100644 --- a/db/simple_table_db_test.cc +++ b/db/simple_table_db_test.cc @@ -17,7 +17,7 @@ #include "db/filename.h" 
#include "db/version_set.h" #include "db/write_batch_internal.h" -#include "db/db_statistics.h" +#include "rocksdb/statistics.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" diff --git a/db/write_batch.cc b/db/write_batch.cc index 7a6106afa..11c98ff0d 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -28,7 +28,7 @@ #include "db/snapshot.h" #include "db/write_batch_internal.h" #include "util/coding.h" -#include "util/statistics_imp.h" +#include "util/statistics.h" #include namespace rocksdb { diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 011e510f5..f5fbb5924 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -242,53 +242,10 @@ struct HistogramData { double standard_deviation; }; - -class Histogram { - public: - // clear's the histogram - virtual void Clear() = 0; - virtual ~Histogram(); - // Add a value to be recorded in the histogram. - virtual void Add(uint64_t value) = 0; - - virtual std::string ToString() const = 0; - - // Get statistics - virtual double Median() const = 0; - virtual double Percentile(double p) const = 0; - virtual double Average() const = 0; - virtual double StandardDeviation() const = 0; - virtual void Data(HistogramData * const data) const = 0; - -}; - -/** - * A dumb ticker which keeps incrementing through its life time. - * Thread safe. Locking managed by implementation of this interface. - */ -class Ticker { - public: - Ticker() : count_(0) { } - - inline void setTickerCount(uint64_t count) { - count_ = count; - } - - inline void recordTick(int count = 1) { - count_ += count; - } - - inline uint64_t getCount() { - return count_; - } - - private: - std::atomic_uint_fast64_t count_; -}; - // Analyze the performance of a db class Statistics { public: + virtual ~Statistics() {} virtual long getTickerCount(Tickers tickerType) = 0; virtual void recordTick(Tickers tickerType, uint64_t count = 0) = 0; diff --git a/table/table_test.cc b/table/table_test.cc index d404e0b2a..9907550ce 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -12,7 +12,8 @@ #include #include "db/dbformat.h" -#include "db/db_statistics.h" +#include "rocksdb/statistics.h" +#include "util/statistics.h" #include "db/memtable.h" #include "db/write_batch_internal.h" #include "rocksdb/cache.h" @@ -935,18 +936,12 @@ TEST(TableTest, NumBlockStat) { class BlockCacheProperties { public: explicit BlockCacheProperties(Statistics* statistics) { - block_cache_miss = - statistics->getTickerCount(BLOCK_CACHE_MISS); - block_cache_hit = - statistics->getTickerCount(BLOCK_CACHE_HIT); - index_block_cache_miss = - statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS); - index_block_cache_hit = - statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT); - data_block_cache_miss = - statistics->getTickerCount(BLOCK_CACHE_DATA_MISS); - data_block_cache_hit = - statistics->getTickerCount(BLOCK_CACHE_DATA_HIT); + block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_MISS); + block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_HIT); + index_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS); + index_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT); + data_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_DATA_MISS); + data_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_DATA_HIT); } // Check if the fetched props matches the expected ones. 
diff --git a/tools/db_stress.cc b/tools/db_stress.cc
index 966f007e8..8321c7eaf 100644
--- a/tools/db_stress.cc
+++ b/tools/db_stress.cc
@@ -26,7 +26,7 @@
 #include
 #include "db/db_impl.h"
 #include "db/version_set.h"
-#include "db/db_statistics.h"
+#include "rocksdb/statistics.h"
 #include "rocksdb/cache.h"
 #include "utilities/utility_db.h"
 #include "rocksdb/env.h"
diff --git a/util/histogram.cc b/util/histogram.cc
index e83998014..968769cef 100644
--- a/util/histogram.cc
+++ b/util/histogram.cc
@@ -16,27 +16,38 @@
 namespace rocksdb {

-HistogramBucketMapper::HistogramBucketMapper() :
-  // Add newer bucket index here.
-  // Should be alwyas added in sorted order.
-  bucketValues_({
-    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 45,
-    50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 300, 350, 400, 450,
-    500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000,
-    3500, 4000, 4500, 5000, 6000, 7000, 8000, 9000, 10000, 12000, 14000,
-    16000, 18000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 60000,
-    70000, 80000, 90000, 100000, 120000, 140000, 160000, 180000, 200000,
-    250000, 300000, 350000, 400000, 450000, 500000, 600000, 700000, 800000,
-    900000, 1000000, 1200000, 1400000, 1600000, 1800000, 2000000, 2500000,
-    3000000, 3500000, 4000000, 4500000, 5000000, 6000000, 7000000, 8000000,
-    9000000, 10000000, 12000000, 14000000, 16000000, 18000000, 20000000,
-    25000000, 30000000, 35000000, 40000000, 45000000, 50000000, 60000000,
-    70000000, 80000000, 90000000, 100000000, 120000000, 140000000, 160000000,
-    180000000, 200000000, 250000000, 300000000, 350000000, 400000000,
-    450000000, 500000000, 600000000, 700000000, 800000000, 900000000,
-    1000000000}),
-  maxBucketValue_(bucketValues_.back()),
-  minBucketValue_(bucketValues_.front()) {
+HistogramBucketMapper::HistogramBucketMapper()
+    :
+      // Add newer bucket index here.
+      // Should be always added in sorted order.
+      // If you change this, you also need to change
+      // size of array buckets_ in HistogramImpl
+      bucketValues_(
+          {1, 2, 3, 4, 5, 6,
+           7, 8, 9, 10, 12, 14,
+           16, 18, 20, 25, 30, 35,
+           40, 45, 50, 60, 70, 80,
+           90, 100, 120, 140, 160, 180,
+           200, 250, 300, 350, 400, 450,
+           500, 600, 700, 800, 900, 1000,
+           1200, 1400, 1600, 1800, 2000, 2500,
+           3000, 3500, 4000, 4500, 5000, 6000,
+           7000, 8000, 9000, 10000, 12000, 14000,
+           16000, 18000, 20000, 25000, 30000, 35000,
+           40000, 45000, 50000, 60000, 70000, 80000,
+           90000, 100000, 120000, 140000, 160000, 180000,
+           200000, 250000, 300000, 350000, 400000, 450000,
+           500000, 600000, 700000, 800000, 900000, 1000000,
+           1200000, 1400000, 1600000, 1800000, 2000000, 2500000,
+           3000000, 3500000, 4000000, 4500000, 5000000, 6000000,
+           7000000, 8000000, 9000000, 10000000, 12000000, 14000000,
+           16000000, 18000000, 20000000, 25000000, 30000000, 35000000,
+           40000000, 45000000, 50000000, 60000000, 70000000, 80000000,
+           90000000, 100000000, 120000000, 140000000, 160000000, 180000000,
+           200000000, 250000000, 300000000, 350000000, 400000000, 450000000,
+           500000000, 600000000, 700000000, 800000000, 900000000, 1000000000}),
+      maxBucketValue_(bucketValues_.back()),
+      minBucketValue_(bucketValues_.front()) {
   for (size_t i =0; i < bucketValues_.size(); ++i) {
     valueIndexMap_[bucketValues_[i]] = i;
   }
@@ -62,24 +73,17 @@ namespace {
   const HistogramBucketMapper bucketMapper;
 }

-
-HistogramImpl::HistogramImpl() :
-  min_(bucketMapper.LastValue()),
-  max_(0),
-  num_(0),
-  sum_(0),
-  sum_squares_(0),
-  buckets_(std::vector<uint64_t>(bucketMapper.BucketCount(), 0)) {}
-
 void HistogramImpl::Clear() {
   min_ = bucketMapper.LastValue();
   max_ = 0;
   num_ = 0;
   sum_ = 0;
   sum_squares_ = 0;
-  buckets_.resize(bucketMapper.BucketCount(), 0);
+  memset(buckets_, 0, sizeof buckets_);
 }

+bool HistogramImpl::Empty() { return sum_squares_ == 0; }
+
 void HistogramImpl::Add(uint64_t value) {
   const size_t index = bucketMapper.IndexForValue(value);
   buckets_[index] += 1;
diff --git a/util/histogram.h b/util/histogram.h
index c01594da7..d95588dc2 100644
--- a/util/histogram.h
+++ b/util/histogram.h
@@ -52,9 +52,8 @@ class HistogramBucketMapper {

 class HistogramImpl {
  public:
-  HistogramImpl();
-  virtual ~HistogramImpl() {}
   virtual void Clear();
+  virtual bool Empty();
   virtual void Add(uint64_t value);
   void Merge(const HistogramImpl& other);

@@ -67,13 +66,14 @@ class HistogramImpl {
   virtual void Data(HistogramData * const data) const;

  private:
+  // To be able to use HistogramImpl as a thread-local variable, its
+  // initialization has to be static, so the values below are copied in
+  // manually from BucketMapper.
-  double min_;
-  double max_;
-  double num_;
-  double sum_;
-  double sum_squares_;
-  std::vector<uint64_t> buckets_;
-
+  double min_ = 1000000000;  // this is BucketMapper::LastValue()
+  double max_ = 0;
+  double num_ = 0;
+  double sum_ = 0;
+  double sum_squares_ = 0;
+  uint64_t buckets_[138] = {0};  // this is BucketMapper::BucketCount()
 };

 }  // namespace rocksdb
diff --git a/util/statistics.cc b/util/statistics.cc
index 5f7a5ba46..f19a777c1 100644
--- a/util/statistics.cc
+++ b/util/statistics.cc
@@ -3,12 +3,48 @@
 // LICENSE file in the root directory of this source tree. An additional grant
 // of patent rights can be found in the PATENTS file in the same directory.
 //
+#include "util/statistics.h"
 #include "rocksdb/statistics.h"
 #include

 namespace rocksdb {

+std::shared_ptr<Statistics> CreateDBStatistics() {
+  return std::make_shared<StatisticsImpl>();
+}
+
+StatisticsImpl::StatisticsImpl() {}
+
+StatisticsImpl::~StatisticsImpl() {}
+
+long StatisticsImpl::getTickerCount(Tickers tickerType) {
+  assert(tickerType < TICKER_ENUM_MAX);
+  return tickers_[tickerType];
+}
+
+void StatisticsImpl::setTickerCount(Tickers tickerType, uint64_t count) {
+  assert(tickerType < TICKER_ENUM_MAX);
+  tickers_[tickerType] = count;
+}
+
+void StatisticsImpl::recordTick(Tickers tickerType, uint64_t count) {
+  assert(tickerType < TICKER_ENUM_MAX);
+  tickers_[tickerType] += count;
+}
+
+void StatisticsImpl::measureTime(Histograms histogramType, uint64_t value) {
+  assert(histogramType < HISTOGRAM_ENUM_MAX);
+  histograms_[histogramType].Add(value);
+}
+
+void StatisticsImpl::histogramData(Histograms histogramType,
+                                   HistogramData* const data) {
+  assert(histogramType < HISTOGRAM_ENUM_MAX);
+  histograms_[histogramType].Data(data);
+}
+
 namespace {

   // a buffer size used for temp string buffers
   const int kBufferSize = 200;

@@ -32,11 +68,8 @@ std::string HistogramToString (
   return std::string(buffer);
 };

-std::string TickerToString (
-    Statistics* dbstats,
-    const Tickers& ticker,
-    const std::string& name) {
-
+std::string TickerToString(Statistics* dbstats, const Tickers& ticker,
+                           const std::string& name) {
   char buffer[kBufferSize];
   snprintf(buffer, kBufferSize, "%s COUNT : %ld\n",
            name.c_str(), dbstats->getTickerCount(ticker));
diff --git a/util/statistics.h b/util/statistics.h
new file mode 100644
index 000000000..36456dddc
--- /dev/null
+++ b/util/statistics.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include "rocksdb/statistics.h"
+#include "util/histogram.h"
+#include "util/mutexlock.h"
+
+#define UNLIKELY(val) (__builtin_expect((val), 0))
+
+namespace rocksdb {
+
+class StatisticsImpl : public Statistics {
+ public:
+  StatisticsImpl();
+  virtual ~StatisticsImpl();
+
+  virtual long getTickerCount(Tickers tickerType);
+  virtual void setTickerCount(Tickers tickerType, uint64_t count);
+  virtual void recordTick(Tickers tickerType, uint64_t count);
+  virtual void measureTime(Histograms histogramType, uint64_t value);
+  virtual void histogramData(Histograms histogramType,
+                             HistogramData* const data);
+
+ private:
+  std::atomic_uint_fast64_t tickers_[TICKER_ENUM_MAX];
+  HistogramImpl histograms_[HISTOGRAM_ENUM_MAX];
+};
+
+// Utility functions
+inline void MeasureTime(Statistics* statistics, Histograms histogramType,
+                        uint64_t value) {
+  if (statistics) {
+    statistics->measureTime(histogramType, value);
+  }
+}
+
+inline void RecordTick(Statistics* statistics, Tickers ticker,
+                       uint64_t count = 1) {
+  if (statistics) {
+    statistics->recordTick(ticker, count);
+  }
+}
+
+inline void SetTickerCount(Statistics* statistics, Tickers ticker,
+                           uint64_t count) {
+  if (statistics) {
+    statistics->setTickerCount(ticker, count);
+  }
+}
+}
diff --git a/util/statistics_imp.h b/util/statistics_imp.h
deleted file mode 100644
index 0dc8884c1..000000000
--- a/util/statistics_imp.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2013, Facebook, Inc. All rights reserved.
-// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// -#pragma once -#include "rocksdb/statistics.h" - -namespace rocksdb { - -// Utility functions -inline void RecordTick(Statistics* statistics, - Tickers ticker, - uint64_t count = 1) { - assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX); - assert(TickersNameMap.size() == TICKER_ENUM_MAX); - if (statistics) { - statistics->recordTick(ticker, count); - } -} - -inline void SetTickerCount(Statistics* statistics, - Tickers ticker, - uint64_t count) { - assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX); - assert(TickersNameMap.size() == TICKER_ENUM_MAX); - if (statistics) { - statistics->setTickerCount(ticker, count); - } -} - -} diff --git a/util/stop_watch.h b/util/stop_watch.h index 6325a7440..48e1b01c2 100644 --- a/util/stop_watch.h +++ b/util/stop_watch.h @@ -5,7 +5,7 @@ // #pragma once #include "rocksdb/env.h" -#include "util/statistics_imp.h" +#include "util/statistics.h" namespace rocksdb { // Auto-scoped. @@ -28,11 +28,7 @@ class StopWatch { return env_->NowMicros() - start_time_; } - ~StopWatch() { - if (statistics_) { - statistics_->measureTime(histogram_name_, ElapsedMicros()); - } - } + ~StopWatch() { MeasureTime(statistics_, histogram_name_, ElapsedMicros()); } private: Env* const env_; From 4e8321bfeae8541fb5d827dfcb089e39078841bc Mon Sep 17 00:00:00 2001 From: Mark Callaghan Date: Fri, 17 Jan 2014 16:38:54 -0800 Subject: [PATCH 27/27] Boost access before mutex is unlocked Summary: This moves the use of versions_ to before the mutex is unlocked to avoid a possible race. Task ID: # Blame Rev: Test Plan: make check Revert Plan: Database Impact: Memcache Impact: Other Notes: EImportant: - begin *PUBLIC* platform impact section - Bugzilla: # - end platform impact - Reviewers: haobo, dhruba Reviewed By: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D15279 --- db/db_impl.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index a16f4479e..43f21505b 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3200,15 +3200,15 @@ Status DBImpl::MakeRoomForWrite(bool force, // individual write by 0-1ms to reduce latency variance. Also, // this delay hands over some CPU to the compaction thread in // case it is sharing the same core as the writer. + uint64_t slowdown = + SlowdownAmount(versions_->current()->NumLevelFiles(0), + options_.level0_slowdown_writes_trigger, + options_.level0_stop_writes_trigger); mutex_.Unlock(); uint64_t delayed; { StopWatch sw(env_, options_.statistics.get(), STALL_L0_SLOWDOWN_COUNT); - env_->SleepForMicroseconds( - SlowdownAmount(versions_->current()->NumLevelFiles(0), - options_.level0_slowdown_writes_trigger, - options_.level0_stop_writes_trigger) - ); + env_->SleepForMicroseconds(slowdown); delayed = sw.ElapsedMicros(); } RecordTick(options_.statistics.get(), STALL_L0_SLOWDOWN_MICROS, delayed);
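The reasoning behind this final hunk is worth restating: versions_ is guarded by mutex_, so the level-0 file count feeding SlowdownAmount() has to be read while the lock is still held; only the sleep itself should run unlocked. A minimal standalone sketch of the same pattern follows; mu, shared_level0_files, and SlowdownWrite() are illustrative stand-ins, not RocksDB names:

    #include <chrono>
    #include <cstdint>
    #include <mutex>
    #include <thread>

    std::mutex mu;                // plays the role of DBImpl::mutex_
    int shared_level0_files = 0;  // mutex-guarded shared state

    void SlowdownWrite() {
      std::unique_lock<std::mutex> lock(mu);
      // Snapshot everything guarded by the mutex *before* unlocking...
      uint64_t slowdown_micros =
          static_cast<uint64_t>(shared_level0_files) * 100;
      lock.unlock();
      // ...so the slow operation never reads shared state unprotected.
      std::this_thread::sleep_for(std::chrono::microseconds(slowdown_micros));
      lock.lock();  // re-acquire before touching guarded state again
    }

Holding the computed value across the unlock costs nothing, while reading versions_ after the unlock would race with any thread that installs a new Version in the meantime.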